""" Gradio app for running QTSplus on Hugging Face Spaces. This follows the inference example in README.md and uses the `AlpachinoNLP/QTSplus-3B` Hugging Face model. """ from __future__ import annotations import spaces import os import sys from typing import Optional, List, Tuple os.system('pip3 install transformers==4.57.1 huggingface_hub>=1.01 av qwen-vl-utils accelerate sentencepiece bitsandbytes -U') os.system('pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu128 -U') import gradio as gr import torch from transformers import AutoModelForCausalLM, AutoProcessor from typing import Optional, List, Tuple # Ensure project root (which contains `src/`) is on PYTHONPATH so we can # reuse the local vision processing utilities instead of relying on # external `qwen_vl_utils`. PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if PROJECT_ROOT not in sys.path: sys.path.insert(0, PROJECT_ROOT) from qwen_vl_utils import process_vision_info DEFAULT_MODEL_ID = os.environ.get("QTSPLUS_MODEL_ID", "AlpachinoNLP/QTSplus-3B") DEFAULT_QUESTION = "What is happening in the video?" USE_CUDA = True # set to False if you ever run this Space on CPU-only hardware DEVICE = "cuda" if USE_CUDA else "cpu" DTYPE = torch.bfloat16 _MODEL: Optional[AutoModelForCausalLM] = None _PROCESSOR: Optional[AutoProcessor] = None def load_model_and_processor() -> Tuple[AutoModelForCausalLM, AutoProcessor]: """Lazy-load the QTSplus model and processor.""" global _MODEL, _PROCESSOR if _MODEL is not None and _PROCESSOR is not None: return _MODEL, _PROCESSOR model_id = DEFAULT_MODEL_ID model_kwargs = {"trust_remote_code": True, "torch_dtype": DTYPE} # if DEVICE == "cuda": # # Let Transformers place layers automatically on the available GPU. # model_kwargs["device_map"] = "auto" _MODEL = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) _MODEL.eval() _MODEL.to(DEVICE) _PROCESSOR = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Ensure generation uses explicit pad/eos ids. tok = getattr(_PROCESSOR, "tokenizer", None) if tok is not None: if getattr(_MODEL.config, "pad_token_id", None) is None: _MODEL.config.pad_token_id = tok.pad_token_id or tok.eos_token_id if getattr(_MODEL.config, "eos_token_id", None) is None: _MODEL.config.eos_token_id = tok.eos_token_id if hasattr(_MODEL, "generation_config") and _MODEL.generation_config is not None: _MODEL.generation_config.pad_token_id = _MODEL.config.pad_token_id _MODEL.generation_config.eos_token_id = _MODEL.config.eos_token_id return _MODEL, _PROCESSOR # # Preload model and processor at import time (for faster first inference). load_model_and_processor() def build_messages(video: Optional[str], prompt: str) -> List[dict]: """Build chat-style messages for a single video + question.""" if not video: raise ValueError("Please upload a video before running the model.") return [ { "role": "user", "content": [ {"type": "video", "video": video, "max_pixels": 360 * 420, "fps": 1.0}, {"type": "text", "text": prompt or DEFAULT_QUESTION}, ], } ] @spaces.GPU(duration=120) def qtsplus_generate(video_path: Optional[str], question: str, max_new_tokens: int = 256) -> str: """Run QTSplus on the given video and question.""" if not video_path: return "Please upload a video first." 
video_path.replace("file://","") model, processor = load_model_and_processor() messages = build_messages(video_path, question) text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True) inputs = processor( text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", **(video_kwargs or {}), ) # Move inputs to the correct device and dtype. inputs = inputs.to(dtype=DTYPE, device=DEVICE) # Extract vision tensors for QTSplus-specific `vision_input` argument. pixel_values_videos = inputs.pop("pixel_values_videos", None) video_grid_thw = inputs.pop("video_grid_thw", None) if "second_per_grid_ts" in inputs: inputs.pop("second_per_grid_ts") vision_input = None if pixel_values_videos is not None and video_grid_thw is not None: vision_input = { "pixel_values_videos": pixel_values_videos, "video_grid_thw": video_grid_thw, } # Build question_input_ids from the raw textual question. tok = getattr(processor, "tokenizer", None) question_ids = None if tok is not None and question: question_ids = tok( question, return_tensors="pt", add_special_tokens=False, ).input_ids.to(DEVICE) with torch.no_grad(): generated_ids = model.generate( vision_input=vision_input, input_ids=inputs.input_ids, question_input_ids=question_ids if question_ids is not None else inputs.input_ids, max_new_tokens=int(max_new_tokens), ) # Remove the prompt tokens from the generated sequence. generated_ids_trimmed = [ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True, ) # Fallback: if trimming yields empty text, decode full sequences instead. if not output_text or not output_text[0].strip(): output_text = [ processor.decode(ids, skip_special_tokens=True) for ids in generated_ids ] return output_text[0] if output_text else "" with gr.Blocks() as demo: gr.Markdown("# QTSplus-3B Video QA Demo") with gr.Row(): video = gr.Video(label="Video") with gr.Column(): question_box = gr.Textbox( label="Question", lines=3, value=DEFAULT_QUESTION, ) max_tokens = gr.Slider( minimum=16, maximum=512, step=16, value=256, label="Max new tokens", ) run_button = gr.Button("Run") output_box = gr.Textbox(label="Model answer", lines=6) run_button.click( fn=qtsplus_generate, inputs=[video, question_box, max_tokens], outputs=output_box, ) if __name__ == "__main__": demo.queue().launch(ssr_mode=False)