import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
MODEL_ID = "qvac/genesis-i-model"
# ----------------------
# Load tokenizer & model
# ----------------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Make sure we have a pad token
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token
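# Note: generate() expects a pad_token_id when padding is involved; reusing EOS as
# the pad token is the common fallback for causal LMs that ship without one.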
print("Detecting device & dtype...")
if torch.cuda.is_available():
    # On ZeroGPU / real GPU: use bf16 if supported, else fp16
    try:
        bf16_ok = torch.cuda.is_bf16_supported()
    except AttributeError:
        bf16_ok = False
    torch_dtype = torch.bfloat16 if bf16_ok else torch.float16
    device_map = "auto"
else:
    # CPU fallback
    torch_dtype = torch.float32
    device_map = "cpu"
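# Heads-up (assumption): on ZeroGPU Spaces the GPU is typically attached on demand,
# so torch.cuda.is_available() at import time may differ from what a
# @spaces.GPU-decorated handler sees at request time.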
print(f"Loading model on {device_map} with dtype={torch_dtype}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    device_map=device_map,
)
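# device_map="auto" relies on the `accelerate` package being installed; on a
# single-GPU Space it normally just places the whole model on that one device.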
model.eval()
# ----------------------
# Helper: build chat input
# ----------------------
def build_inputs(prompt: str):
    """
    Build input_ids using the model's chat_template.
    We give it a simple system + user conversation and ask the
    tokenizer to add the assistant generation prompt.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an educational AI tutor. "
                "Explain clearly and precisely, focusing on math, science, "
                "engineering, programming, and medical education. "
                "Show intermediate steps when useful, but avoid rambling."
            ),
        },
        {
            "role": "user",
            "content": prompt,
        },
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    return input_ids.to(model.device)
# ----------------------
# Generation function
# ----------------------
def generate(
    prompt: str,
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_new_tokens: int = 256,
):
    if not prompt.strip():
        return "Please enter a prompt."
    input_ids = build_inputs(prompt)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,  # light anti-repetition
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Keep only the newly generated tokens (assistant part)
    new_tokens = output_ids[0, input_ids.shape[-1]:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    text = text.strip()
    if not text:
        text = "[Empty response]"
    return text
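# Optional local smoke test (assumes the model fits on this machine); uncomment to try:
# print(generate("Explain Newton's second law in one paragraph.", max_new_tokens=64))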
# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # QVAC Genesis I – Educational LLM Demo
        Model: **qvac/genesis-i-model**
        Trained on the QVAC Genesis I synthetic educational dataset (STEM-heavy).
        Ask it math, science, engineering, or medical education questions.
        """
    )
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Example: Explain why 2 + 2 = 4 in a way a 10-year-old can understand.",
                lines=6,
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.2,
                value=0.7,
                step=0.05,
                label="Temperature (creativity)",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p (nucleus sampling)",
            )
            max_new_tokens = gr.Slider(
                minimum=16,
                maximum=512,
                value=256,
                step=16,
                label="Max new tokens",
            )
            submit = gr.Button("Generate")
        with gr.Column(scale=4):
            output = gr.Textbox(
                label="Model output",
                lines=18,
            )
    submit.click(
        fn=generate,
        inputs=[prompt, temperature, top_p, max_new_tokens],
        outputs=output,
    )
    # Press Enter in the prompt box to generate
    prompt.submit(
        fn=generate,
        inputs=[prompt, temperature, top_p, max_new_tokens],
        outputs=output,
    )
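# queue() enables request queuing so simultaneous users wait their turn instead of
# hitting the single model instance at once; the usual setup for GPU-backed Spaces.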
demo.queue().launch()