"""
Gradio app for running QTSplus on Hugging Face Spaces.
This follows the inference example in README.md and uses the
`AlpachinoNLP/QTSplus-3B` Hugging Face model.
"""
from __future__ import annotations
import spaces
import os
import sys
from typing import Optional, List, Tuple
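# Install pinned runtime dependencies at startup, before they are imported below.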
os.system('pip3 install transformers==4.57.1 "huggingface_hub>=1.01" av qwen-vl-utils accelerate sentencepiece bitsandbytes -U')
os.system('pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu128 -U')
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
# Put the project root (which contains `src/`) on sys.path so a local copy of the
# vision-processing utilities can shadow the pip-installed `qwen_vl_utils` package
# when one is present; otherwise the import below uses the installed package.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
from qwen_vl_utils import process_vision_info
DEFAULT_MODEL_ID = os.environ.get("QTSPLUS_MODEL_ID", "AlpachinoNLP/QTSplus-3B")
DEFAULT_QUESTION = "What is happening in the video?"
USE_CUDA = True # set to False if you ever run this Space on CPU-only hardware
DEVICE = "cuda" if USE_CUDA else "cpu"
DTYPE = torch.bfloat16
_MODEL: Optional[AutoModelForCausalLM] = None
_PROCESSOR: Optional[AutoProcessor] = None
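# The loaded model/processor are cached in module-level globals so repeated
# Gradio calls reuse the same weights instead of reloading them.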
def load_model_and_processor() -> Tuple[AutoModelForCausalLM, AutoProcessor]:
"""Lazy-load the QTSplus model and processor."""
global _MODEL, _PROCESSOR
if _MODEL is not None and _PROCESSOR is not None:
return _MODEL, _PROCESSOR
model_id = DEFAULT_MODEL_ID
model_kwargs = {"trust_remote_code": True, "torch_dtype": DTYPE}
# if DEVICE == "cuda":
# # Let Transformers place layers automatically on the available GPU.
# model_kwargs["device_map"] = "auto"
_MODEL = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
_MODEL.eval()
_MODEL.to(DEVICE)
_PROCESSOR = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Ensure generation uses explicit pad/eos ids.
tok = getattr(_PROCESSOR, "tokenizer", None)
if tok is not None:
if getattr(_MODEL.config, "pad_token_id", None) is None:
_MODEL.config.pad_token_id = tok.pad_token_id or tok.eos_token_id
if getattr(_MODEL.config, "eos_token_id", None) is None:
_MODEL.config.eos_token_id = tok.eos_token_id
if hasattr(_MODEL, "generation_config") and _MODEL.generation_config is not None:
_MODEL.generation_config.pad_token_id = _MODEL.config.pad_token_id
_MODEL.generation_config.eos_token_id = _MODEL.config.eos_token_id
return _MODEL, _PROCESSOR
# Preload the model and processor at import time (for faster first inference).
load_model_and_processor()
def build_messages(video: Optional[str], prompt: str) -> List[dict]:
"""Build chat-style messages for a single video + question."""
if not video:
raise ValueError("Please upload a video before running the model.")
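    # Qwen-VL style chat message: one video entry (sampled at 1 fps, frames capped
    # to 360*420 pixels each) followed by the text question.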
return [
{
"role": "user",
"content": [
{"type": "video", "video": video, "max_pixels": 360 * 420, "fps": 1.0},
{"type": "text", "text": prompt or DEFAULT_QUESTION},
],
}
]
@spaces.GPU(duration=120)
def qtsplus_generate(video_path: Optional[str], question: str, max_new_tokens: int = 256) -> str:
"""Run QTSplus on the given video and question."""
if not video_path:
return "Please upload a video first."
    # Gradio may hand us a file:// URI; strip the scheme so downstream decoding gets a plain path.
    video_path = video_path.replace("file://", "")
model, processor = load_model_and_processor()
messages = build_messages(video_path, question)
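    # Render the chat template into a prompt string and let `process_vision_info`
    # decode the video into frame tensors plus per-video sampling kwargs (such as fps).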
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
**(video_kwargs or {}),
)
# Move inputs to the correct device and dtype.
inputs = inputs.to(dtype=DTYPE, device=DEVICE)
# Extract vision tensors for QTSplus-specific `vision_input` argument.
pixel_values_videos = inputs.pop("pixel_values_videos", None)
video_grid_thw = inputs.pop("video_grid_thw", None)
if "second_per_grid_ts" in inputs:
inputs.pop("second_per_grid_ts")
vision_input = None
if pixel_values_videos is not None and video_grid_thw is not None:
vision_input = {
"pixel_values_videos": pixel_values_videos,
"video_grid_thw": video_grid_thw,
}
# Build question_input_ids from the raw textual question.
tok = getattr(processor, "tokenizer", None)
question_ids = None
if tok is not None and question:
question_ids = tok(
question,
return_tensors="pt",
add_special_tokens=False,
).input_ids.to(DEVICE)
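    # QTSplus's remote-code `generate` takes the vision tensors and the tokenized
    # question as separate arguments (mirroring the README inference example),
    # falling back to the full prompt ids when no standalone question is available.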
with torch.no_grad():
generated_ids = model.generate(
vision_input=vision_input,
input_ids=inputs.input_ids,
question_input_ids=question_ids if question_ids is not None else inputs.input_ids,
max_new_tokens=int(max_new_tokens),
)
# Remove the prompt tokens from the generated sequence.
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
# Fallback: if trimming yields empty text, decode full sequences instead.
if not output_text or not output_text[0].strip():
output_text = [
processor.decode(ids, skip_special_tokens=True)
for ids in generated_ids
]
return output_text[0] if output_text else ""
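# Gradio UI: video upload plus question controls, with the model answer shown in a textbox.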
with gr.Blocks() as demo:
gr.Markdown("# QTSplus-3B Video QA Demo")
with gr.Row():
video = gr.Video(label="Video")
with gr.Column():
question_box = gr.Textbox(
label="Question",
lines=3,
value=DEFAULT_QUESTION,
)
max_tokens = gr.Slider(
minimum=16,
maximum=512,
step=16,
value=256,
label="Max new tokens",
)
run_button = gr.Button("Run")
output_box = gr.Textbox(label="Model answer", lines=6)
run_button.click(
fn=qtsplus_generate,
inputs=[video, question_box, max_tokens],
outputs=output_box,
)
if __name__ == "__main__":
demo.queue().launch(ssr_mode=False)