import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr

MODEL_ID = "qvac/genesis-i-model"

# ----------------------
# Load tokenizer & model
# ----------------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Make sure we have a pad token
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

print("Detecting device & dtype...")
if torch.cuda.is_available():
    # On ZeroGPU / real GPU: use bf16 if supported, else fp16
    try:
        bf16_ok = torch.cuda.is_bf16_supported()
    except AttributeError:
        bf16_ok = False
    torch_dtype = torch.bfloat16 if bf16_ok else torch.float16
    device_map = "auto"
else:
    # CPU fallback
    torch_dtype = torch.float32
    device_map = "cpu"

print(f"Loading model on {device_map} with dtype={torch_dtype}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    device_map=device_map,
)
model.eval()


# ----------------------
# Helper: build chat input
# ----------------------
def build_inputs(prompt: str):
    """
    Build input_ids using the model's chat_template.
    We give it a simple system + user conversation and ask the tokenizer
    to add the assistant generation prompt.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an educational AI tutor. "
                "Explain clearly and precisely, focusing on math, science, "
                "engineering, programming, and medical education. "
                "Show intermediate steps when useful, but avoid rambling."
            ),
        },
        {
            "role": "user",
            "content": prompt,
        },
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    return input_ids.to(model.device)


# ----------------------
# Generation function
# ----------------------
def generate(
    prompt: str,
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_new_tokens: int = 256,
):
    if not prompt.strip():
        return "Please enter a prompt."

    input_ids = build_inputs(prompt)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,  # light anti-repetition
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens (assistant part)
    new_tokens = output_ids[0, input_ids.shape[-1]:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    text = text.strip()
    if not text:
        text = "[Empty response]"
    return text


# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
# QVAC Genesis I – Educational LLM Demo

Model: **qvac/genesis-i-model**
Trained on the QVAC Genesis I synthetic educational dataset (STEM-heavy).

Ask it math, science, engineering, or medical education questions.
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Example: Explain why 2 + 2 = 4 in a way a 10-year-old can understand.",
                lines=6,
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.2,
                value=0.7,
                step=0.05,
                label="Temperature (creativity)",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p (nucleus sampling)",
            )
            max_new_tokens = gr.Slider(
                minimum=16,
                maximum=512,
                value=256,
                step=16,
                label="Max new tokens",
            )
            submit = gr.Button("Generate")

        with gr.Column(scale=4):
            output = gr.Textbox(
                label="Model output",
                lines=18,
            )

    submit.click(
        fn=generate,
        inputs=[prompt, temperature, top_p, max_new_tokens],
        outputs=output,
    )

    # Press Enter in the prompt box to generate
    prompt.submit(
        fn=generate,
        inputs=[prompt, temperature, top_p, max_new_tokens],
        outputs=output,
    )

demo.queue().launch()