astegaras committed on
Commit 62f86f8 · verified · 1 Parent(s): 442c9db

Update app.py

Files changed (1)
  1. app.py +12 -6
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# Download GGUF to local file
 model_path = hf_hub_download(
     repo_id="astegaras/Llama3.2_3B",
     filename="model-Q2_K.gguf"
@@ -12,22 +11,29 @@ llm = Llama(
     model_path=model_path,
     n_ctx=2048,
     n_gpu_layers=0,
-    chat_format=None,     # <-- CRITICAL: disable chat templates
+    chat_format=None,     # <-- ABSOLUTELY REQUIRED
+    add_bos_token=False,  # <-- REQUIRED
+    add_eos_token=False,  # <-- REQUIRED
 )
 
 def respond(user_question):
-    # Format prompt exactly like your training data
+
+    # sanitize input for your tokenizer
+    user_question = user_question.replace("\r", " ").replace("\n", " ")
+    user_question = user_question.encode("ascii", "ignore").decode()
+
+    # match your fine-tuning format exactly
     prompt = f"Q: {user_question}\nA:"
-
+
     out = llm.create_completion(
         prompt=prompt,
         max_tokens=256,
         temperature=0.7,
         top_p=0.9,
+        stop=["Q:"],  # safety
     )
-
-    # Return the generated answer
     return out["choices"][0]["text"]
 
 gr.Interface(fn=respond, inputs="text", outputs="text").launch()
 
+
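For quick local sanity-checking of the new prompt handling, a minimal smoke test might look like the sketch below. It is not part of the commit: the sample questions are made up, and it assumes the definitions from app.py above (the model download, llm, and respond) are in scope.

# Hypothetical smoke test -- not in the commit. Run it in place of the
# gr.Interface(...).launch() line, e.g. appended to a local copy of app.py.
if __name__ == "__main__":
    questions = [
        "What is a GGUF file?",          # sample question (made up)
        "Why run with n_gpu_layers=0?",  # sample question (made up)
    ]
    for q in questions:
        # respond() sanitizes the input, builds "Q: {q}\nA:", and the
        # stop=["Q:"] sequence added in this commit halts generation
        # before the model can start a new question on its own.
        print(f"Q: {q}\nA: {respond(q).strip()}\n")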