import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr

MODEL_ID = "qvac/genesis-i-model"

# ----------------------
# Load tokenizer & model
# ----------------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Make sure we have a pad token
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

print("Detecting device & dtype...")
if torch.cuda.is_available():
    # On ZeroGPU / real GPU: use bf16 if supported, else fp16
    try:
        bf16_ok = torch.cuda.is_bf16_supported()
    except AttributeError:
        bf16_ok = False
    torch_dtype = torch.bfloat16 if bf16_ok else torch.float16
    device_map = "auto"
else:
    # CPU fallback
    torch_dtype = torch.float32
    device_map = "cpu"

print(f"Loading model on {device_map} with dtype={torch_dtype}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    device_map=device_map,
)
model.eval()


# ----------------------
# Helper: build chat input
# ----------------------
def build_inputs(prompt: str):
    """
    Build input_ids using the model's chat_template.
    We give it a simple system + user conversation and ask the tokenizer
    to add the assistant generation prompt.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an educational AI tutor. "
                "Explain clearly and precisely, focusing on math, science, "
                "engineering, programming, and medical education. "
                "Show intermediate steps when useful, but avoid rambling."
            ),
        },
        {
            "role": "user",
            "content": prompt,
        },
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    return input_ids.to(model.device)


# ----------------------
# Generation function
# ----------------------
def generate(
    prompt: str,
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_new_tokens: int = 256,
):
    if not prompt.strip():
        return "Please enter a prompt."

    input_ids = build_inputs(prompt)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,  # light anti-repetition
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens (assistant part)
    new_tokens = output_ids[0, input_ids.shape[-1]:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    text = text.strip()
    if not text:
        text = "[Empty response]"
    return text


# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
# QVAC Genesis I – Educational LLM Demo

Model: **qvac/genesis-i-model**
Trained on the QVAC Genesis I synthetic educational dataset (STEM-heavy).

Ask it math, science, engineering, or medical education questions.
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Example: Explain why 2 + 2 = 4 in a way a 10-year-old can understand.",
                lines=6,
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.2,
                value=0.7,
                step=0.05,
                label="Temperature (creativity)",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p (nucleus sampling)",
            )
            max_new_tokens = gr.Slider(
                minimum=16,
                maximum=512,
                value=256,
                step=16,
                label="Max new tokens",
            )
            submit = gr.Button("Generate")

        with gr.Column(scale=4):
            output = gr.Textbox(
                label="Model output",
                lines=18,
            )

    submit.click(
        fn=generate,
        inputs=[prompt, temperature, top_p, max_new_tokens],
        outputs=output,
    )

    # Press Enter in the prompt box to generate
    prompt.submit(
        fn=generate,
        inputs=[prompt, temperature, top_p, max_new_tokens],
        outputs=output,
    )

demo.queue().launch()