import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# ----------------------------------------------------
# 1. Download GGUF model from HuggingFace
# ----------------------------------------------------
REPO_ID = "astegaras/merged_kaggle" # your GGUF repo
FILENAME = "model-q4_k_m.gguf" # your GGUF file
print("Downloading GGUF model...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
print("Model downloaded:", model_path)
# ----------------------------------------------------
# 2. Load llama.cpp model
# ----------------------------------------------------
llm = Llama(
    model_path=model_path,
    n_ctx=4096,   # context window size (tokens)
    n_threads=8,  # CPU threads; tune to the Space's available cores
)
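# If the Space has a GPU and llama-cpp-python was built with GPU support,
# layers can be offloaded by also passing n_gpu_layers=-1 above;
# on a CPU-only Space, leave it unset.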
# ----------------------------------------------------
# 3. Chat / inference function
# ----------------------------------------------------
def respond(message, history):
    # Build the prompt manually from prior turns.
    # Assumes Gradio's classic tuple-style history: [(user_msg, bot_msg), ...]
    prompt = ""
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"
    # Generate the next assistant turn
    output = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        stop=["User:", "Assistant:"],  # stop before the model writes the next turn itself
    )
    assistant_reply = output["choices"][0]["text"].strip()
    return assistant_reply
# ----------------------------------------------------
# 4. Launch Gradio Chat Interface
# ----------------------------------------------------
gr.ChatInterface(
    fn=respond,
    title="My Llama.cpp GGUF Model",
    description="Chat with your fine-tuned GGUF model!",
).launch()