astegaras committed on
Commit 251dafb · verified · 1 Parent(s): 7b026ee

Update app.py

Files changed (1)
  1. app.py +33 -28
app.py CHANGED
@@ -1,52 +1,57 @@
 import gradio as gr
-from mlx_lm import load, generate
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
 
 # ----------------------------------------------------
-# 1. Load your quantized MLX model from HuggingFace
+# 1. Download GGUF model from HuggingFace
 # ----------------------------------------------------
-MODEL_REPO = "astegaras/my-mlx-llama3"  # <-- change to your repo
 
-print("Loading model...")
-model, tokenizer = load(MODEL_REPO)
-print("Model loaded!")
+REPO_ID = "astegaras/merged_kaggle"    # your GGUF repo
+FILENAME = "model-q4_k_m.gguf"         # your GGUF file
+
+print("Downloading GGUF model...")
+model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+print("Model downloaded:", model_path)
 
 # ----------------------------------------------------
-# 2. Chat / inference function
+# 2. Load llama.cpp model
 # ----------------------------------------------------
-def respond(user_input, history):
-    """
-    user_input: new user message
-    history: list of [user, assistant] messages from Gradio
-    """
 
-    # Build a conversation prompt (simple version)
-    messages = []
-    for user_msg, assistant_msg in history:
-        messages.append(f"User: {user_msg}\nAssistant: {assistant_msg}")
-    messages.append(f"User: {user_input}\nAssistant:")
+llm = Llama(
+    model_path=model_path,
+    n_ctx=4096,      # context size
+    n_threads=8,     # use HF Space CPU
+)
+
+# ----------------------------------------------------
+# 3. Chat / inference function
+# ----------------------------------------------------
+def respond(message, history):
+    prompt = ""
 
-    prompt = "\n".join(messages)
+    # Build prompt manually
+    for user_msg, bot_msg in history:
+        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
+    prompt += f"User: {message}\nAssistant:"
 
-    # Generate with mlx_lm
-    output = generate(
-        model,
-        tokenizer,
+    # Generate response
+    output = llm(
         prompt,
         max_tokens=256,
        temperature=0.7,
        top_p=0.9,
+        stop=["User:", "Assistant:"]
    )
 
-    # Extract only the assistant's new text
-    assistant_reply = output[len(prompt):].strip()
-
+    assistant_reply = output["choices"][0]["text"].strip()
    return assistant_reply
 
 # ----------------------------------------------------
-# 3. Launch Gradio chat interface
+# 4. Launch Gradio Chat Interface
 # ----------------------------------------------------
 gr.ChatInterface(
    fn=respond,
-    title="My MLX Llama Model",
-    description="Chat with your fine-tuned MLX model!",
+    title="My Llama.cpp GGUF Model",
+    description="Chat with your fine-tuned GGUF model!",
 ).launch()
+
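
For the Space to actually build with the new imports, the repository also needs its Python dependencies declared. A minimal requirements.txt sketch, assuming a standard Gradio Space on CPU hardware (the file below is illustrative and not part of this commit):

    # requirements.txt (illustrative; not included in this commit)
    gradio
    huggingface_hub
    llama-cpp-python

With llama-cpp-python installed, the downloaded GGUF file runs entirely on the Space's CPU, so no GPU hardware or mlx runtime is needed anymore; if the Space uses the free CPU basic tier (2 vCPUs), the n_threads=8 setting may be worth lowering to match.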