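"""Gradio demo Space for qvac/genesis-i-model, an educational (STEM-focused) LLM."""
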
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
MODEL_ID = "qvac/genesis-i-model"
# ----------------------
# Load tokenizer & model
# ----------------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Make sure we have a pad token
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token
print("Detecting device & dtype...")
if torch.cuda.is_available():
    # On ZeroGPU / real GPU: use bf16 if supported, else fp16
    try:
        bf16_ok = torch.cuda.is_bf16_supported()
    except AttributeError:
        bf16_ok = False
    torch_dtype = torch.bfloat16 if bf16_ok else torch.float16
    device_map = "auto"
else:
    # CPU fallback
    torch_dtype = torch.float32
    device_map = "cpu"
print(f"Loading model on {device_map} with dtype={torch_dtype}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    device_map=device_map,
)
model.eval()
# ----------------------
# Helper: build chat input
# ----------------------
def build_inputs(prompt: str):
    """
    Build input_ids using the model's chat_template.
    We give it a simple system + user conversation and
    ask the tokenizer to add the assistant generation prompt.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an educational AI tutor. "
                "Explain clearly and precisely, focusing on math, science, "
                "engineering, programming, and medical education. "
                "Show intermediate steps when useful, but avoid rambling."
            ),
        },
        {
            "role": "user",
            "content": prompt,
        },
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    return input_ids.to(model.device)
# ----------------------
# Generation function
# ----------------------
def generate(
    prompt: str,
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_new_tokens: int = 256,
):
    if not prompt.strip():
        return "Please enter a prompt."
    input_ids = build_inputs(prompt)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=int(max_new_tokens),  # Gradio sliders pass floats
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,  # light anti-repetition
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Keep only the newly generated tokens (assistant part)
    new_tokens = output_ids[0, input_ids.shape[-1]:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    text = text.strip()
    if not text:
        text = "[Empty response]"
    return text
# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # QVAC Genesis I – Educational LLM Demo

        Model: **qvac/genesis-i-model**

        Trained on the QVAC Genesis I synthetic educational dataset (STEM-heavy).
        Ask it math, science, engineering, or medical education questions.
        """
    )
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Example: Explain why 2 + 2 = 4 in a way a 10-year-old can understand.",
                lines=6,
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.2,
                value=0.7,
                step=0.05,
                label="Temperature (creativity)",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p (nucleus sampling)",
            )
            max_new_tokens = gr.Slider(
                minimum=16,
                maximum=512,
                value=256,
                step=16,
                label="Max new tokens",
            )
            submit = gr.Button("Generate")
        with gr.Column(scale=4):
            output = gr.Textbox(
                label="Model output",
                lines=18,
            )

    submit.click(
        fn=generate,
        inputs=[prompt, temperature, top_p, max_new_tokens],
        outputs=output,
    )
    # Press Enter in the prompt box to generate
    prompt.submit(
        fn=generate,
        inputs=[prompt, temperature, top_p, max_new_tokens],
        outputs=output,
    )
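# Enable the request queue so longer generations are handled without request timeouts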
demo.queue().launch()