import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# ----------------------------------------------------
# 1. Download GGUF model from HuggingFace
# ----------------------------------------------------
REPO_ID = "astegaras/merged_kaggle" # your GGUF repo
FILENAME = "model-q4_k_m.gguf" # your GGUF file
print("Downloading GGUF model...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
print("Model downloaded:", model_path)
# ----------------------------------------------------
# 2. Load llama.cpp model
# ----------------------------------------------------
llm = Llama(
    model_path=model_path,
    n_ctx=4096,   # context window size (tokens)
    n_threads=8,  # CPU threads; tune to the Space's available cores
)
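# If the Space has a GPU and llama-cpp-python was built with GPU support,
# layers can be offloaded by also passing n_gpu_layers=-1 above;
# on a CPU-only Space, leave it unset.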
# ----------------------------------------------------
# 3. Chat / inference function
# ----------------------------------------------------
def respond(message, history):
    # Build the prompt manually from prior turns.
    # Assumes Gradio's classic tuple-style history: [(user_msg, bot_msg), ...]
    prompt = ""
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"
    # Generate the next assistant turn
    output = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        stop=["User:", "Assistant:"],  # stop before the model writes the next turn itself
    )
    assistant_reply = output["choices"][0]["text"].strip()
    return assistant_reply
# ----------------------------------------------------
# 4. Launch Gradio Chat Interface
# ----------------------------------------------------
gr.ChatInterface(
    fn=respond,
    title="My Llama.cpp GGUF Model",
    description="Chat with your fine-tuned GGUF model!",
).launch()