astegaras committed on
Commit 62f86f8 · verified · 1 Parent(s): 442c9db

Update app.py

Files changed (1)
  1. app.py +12 -6
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# Download GGUF to local file
 model_path = hf_hub_download(
     repo_id="astegaras/Llama3.2_3B",
     filename="model-Q2_K.gguf"
@@ -12,22 +11,29 @@ llm = Llama(
     model_path=model_path,
     n_ctx=2048,
     n_gpu_layers=0,
-    chat_format=None,     # <-- CRITICAL: disable chat templates
+    chat_format=None,     # <-- ABSOLUTELY REQUIRED
+    add_bos_token=False,  # <-- REQUIRED
+    add_eos_token=False,  # <-- REQUIRED
 )
 
 def respond(user_question):
-    # Format prompt exactly like your training data
+
+    # sanitize input for your tokenizer
+    user_question = user_question.replace("\r", " ").replace("\n", " ")
+    user_question = user_question.encode("ascii", "ignore").decode()
+
+    # match your fine-tuning format exactly
     prompt = f"Q: {user_question}\nA:"
-
+
     out = llm.create_completion(
         prompt=prompt,
         max_tokens=256,
         temperature=0.7,
         top_p=0.9,
+        stop=["Q:"],  # safety
     )
-
-    # Return the generated answer
     return out["choices"][0]["text"]
 
 gr.Interface(fn=respond, inputs="text", outputs="text").launch()
 
+
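For quick local sanity-checking of the new prompt handling, a minimal smoke test might look like the sketch below. It is not part of the commit: the sample questions are made up, and it assumes the definitions from app.py above (the model download, llm, and respond) are in scope.

# Hypothetical smoke test -- not in the commit. Run it in place of the
# gr.Interface(...).launch() line, e.g. appended to a local copy of app.py.
if __name__ == "__main__":
    questions = [
        "What is a GGUF file?",          # sample question (made up)
        "Why run with n_gpu_layers=0?",  # sample question (made up)
    ]
    for q in questions:
        # respond() sanitizes the input, builds "Q: {q}\nA:", and the
        # stop=["Q:"] sequence added in this commit halts generation
        # before the model can start a new question on its own.
        print(f"Q: {q}\nA: {respond(q).strip()}\n")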