astegaras committed on
Commit 251dafb · verified · 1 Parent(s): 7b026ee

Update app.py

Files changed (1)
  1. app.py +33 -28
app.py CHANGED
@@ -1,52 +1,57 @@
 import gradio as gr
-from mlx_lm import load, generate
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
 
 # ----------------------------------------------------
-# 1. Load your quantized MLX model from HuggingFace
+# 1. Download GGUF model from HuggingFace
 # ----------------------------------------------------
-MODEL_REPO = "astegaras/my-mlx-llama3"  # <-- change to your repo
 
-print("Loading model...")
-model, tokenizer = load(MODEL_REPO)
-print("Model loaded!")
+REPO_ID = "astegaras/merged_kaggle"    # your GGUF repo
+FILENAME = "model-q4_k_m.gguf"         # your GGUF file
+
+print("Downloading GGUF model...")
+model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+print("Model downloaded:", model_path)
 
 # ----------------------------------------------------
-# 2. Chat / inference function
+# 2. Load llama.cpp model
 # ----------------------------------------------------
-def respond(user_input, history):
-    """
-    user_input: new user message
-    history: list of [user, assistant] messages from Gradio
-    """
 
-    # Build a conversation prompt (simple version)
-    messages = []
-    for user_msg, assistant_msg in history:
-        messages.append(f"User: {user_msg}\nAssistant: {assistant_msg}")
-    messages.append(f"User: {user_input}\nAssistant:")
+llm = Llama(
+    model_path=model_path,
+    n_ctx=4096,      # context size
+    n_threads=8,     # use HF Space CPU
+)
+
+# ----------------------------------------------------
+# 3. Chat / inference function
+# ----------------------------------------------------
+def respond(message, history):
+    prompt = ""
 
-    prompt = "\n".join(messages)
+    # Build prompt manually
+    for user_msg, bot_msg in history:
+        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
+    prompt += f"User: {message}\nAssistant:"
 
-    # Generate with mlx_lm
-    output = generate(
-        model,
-        tokenizer,
+    # Generate response
+    output = llm(
         prompt,
         max_tokens=256,
        temperature=0.7,
        top_p=0.9,
+        stop=["User:", "Assistant:"]
    )
 
-    # Extract only the assistant's new text
-    assistant_reply = output[len(prompt):].strip()
-
+    assistant_reply = output["choices"][0]["text"].strip()
    return assistant_reply
 
 # ----------------------------------------------------
-# 3. Launch Gradio chat interface
+# 4. Launch Gradio Chat Interface
 # ----------------------------------------------------
 gr.ChatInterface(
    fn=respond,
-    title="My MLX Llama Model",
-    description="Chat with your fine-tuned MLX model!",
+    title="My Llama.cpp GGUF Model",
+    description="Chat with your fine-tuned GGUF model!",
 ).launch()
+
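
For the Space to actually build with the new imports, the repository also needs its Python dependencies declared. A minimal requirements.txt sketch, assuming a standard Gradio Space on CPU hardware (the file below is illustrative and not part of this commit):

    # requirements.txt (illustrative; not included in this commit)
    gradio
    huggingface_hub
    llama-cpp-python

With llama-cpp-python installed, the downloaded GGUF file runs entirely on the Space's CPU, so no GPU hardware or mlx runtime is needed anymore; if the Space uses the free CPU basic tier (2 vCPUs), the n_threads=8 setting may be worth lowering to match.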