"""
Gradio app for running QTSplus on Hugging Face Spaces.
This follows the inference example in README.md and uses the
`AlpachinoNLP/QTSplus-3B` Hugging Face model.
"""
from __future__ import annotations
import spaces
import os
import sys
from typing import Optional, List, Tuple
# Install runtime dependencies at startup; quote the ">=" spec so the shell does
# not treat ">" as an output redirection.
os.system('pip3 install transformers==4.57.1 "huggingface_hub>=1.01" av qwen-vl-utils accelerate sentencepiece bitsandbytes -U')
os.system('pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu128 -U')
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
# Make the project root (the directory containing `src/`) importable so local
# utilities can be reused if present; the vision preprocessing below comes from
# the external `qwen_vl_utils` package installed above.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
from qwen_vl_utils import process_vision_info
DEFAULT_MODEL_ID = os.environ.get("QTSPLUS_MODEL_ID", "AlpachinoNLP/QTSplus-3B")
DEFAULT_QUESTION = "What is happening in the video?"
USE_CUDA = True # set to False if you ever run this Space on CPU-only hardware
DEVICE = "cuda" if USE_CUDA else "cpu"
DTYPE = torch.bfloat16
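# The checkpoint can be swapped without editing this file by setting the env var
# before launch, for example (hypothetical repo id):
#   QTSPLUS_MODEL_ID=your-org/your-qtsplus-checkpoint python app.py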
_MODEL: Optional[AutoModelForCausalLM] = None
_PROCESSOR: Optional[AutoProcessor] = None
def load_model_and_processor() -> Tuple[AutoModelForCausalLM, AutoProcessor]:
"""Lazy-load the QTSplus model and processor."""
global _MODEL, _PROCESSOR
if _MODEL is not None and _PROCESSOR is not None:
return _MODEL, _PROCESSOR
model_id = DEFAULT_MODEL_ID
model_kwargs = {"trust_remote_code": True, "torch_dtype": DTYPE}
# if DEVICE == "cuda":
# # Let Transformers place layers automatically on the available GPU.
# model_kwargs["device_map"] = "auto"
_MODEL = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
_MODEL.eval()
_MODEL.to(DEVICE)
_PROCESSOR = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Ensure generation uses explicit pad/eos ids.
tok = getattr(_PROCESSOR, "tokenizer", None)
if tok is not None:
if getattr(_MODEL.config, "pad_token_id", None) is None:
_MODEL.config.pad_token_id = tok.pad_token_id or tok.eos_token_id
if getattr(_MODEL.config, "eos_token_id", None) is None:
_MODEL.config.eos_token_id = tok.eos_token_id
if hasattr(_MODEL, "generation_config") and _MODEL.generation_config is not None:
_MODEL.generation_config.pad_token_id = _MODEL.config.pad_token_id
_MODEL.generation_config.eos_token_id = _MODEL.config.eos_token_id
return _MODEL, _PROCESSOR
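# Note: on ZeroGPU hardware the `spaces` package defers real CUDA placement until
# a @spaces.GPU-decorated function runs, so moving the model to "cuda" at import
# time is the usual pattern for ZeroGPU Spaces.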
# Preload the model and processor at import time so the first request is fast.
load_model_and_processor()
def build_messages(video: Optional[str], prompt: str) -> List[dict]:
"""Build chat-style messages for a single video + question."""
if not video:
raise ValueError("Please upload a video before running the model.")
return [
{
"role": "user",
"content": [
{"type": "video", "video": video, "max_pixels": 360 * 420, "fps": 1.0},
{"type": "text", "text": prompt or DEFAULT_QUESTION},
],
}
]
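# For illustration (hypothetical path), a call such as
#   build_messages("/tmp/clip.mp4", "Describe the scene")
# returns a single-turn chat payload:
#   [{"role": "user", "content": [
#       {"type": "video", "video": "/tmp/clip.mp4", "max_pixels": 151200, "fps": 1.0},
#       {"type": "text", "text": "Describe the scene"}]}]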
@spaces.GPU(duration=120)
def qtsplus_generate(video_path: Optional[str], question: str, max_new_tokens: int = 256) -> str:
"""Run QTSplus on the given video and question."""
if not video_path:
return "Please upload a video first."
    # Gradio may hand back a file:// URI; strip the scheme so the path is usable.
    video_path = video_path.replace("file://", "")
model, processor = load_model_and_processor()
messages = build_messages(video_path, question)
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
**(video_kwargs or {}),
)
# Move inputs to the correct device and dtype.
inputs = inputs.to(dtype=DTYPE, device=DEVICE)
# Extract vision tensors for QTSplus-specific `vision_input` argument.
pixel_values_videos = inputs.pop("pixel_values_videos", None)
video_grid_thw = inputs.pop("video_grid_thw", None)
if "second_per_grid_ts" in inputs:
inputs.pop("second_per_grid_ts")
vision_input = None
if pixel_values_videos is not None and video_grid_thw is not None:
vision_input = {
"pixel_values_videos": pixel_values_videos,
"video_grid_thw": video_grid_thw,
}
# Build question_input_ids from the raw textual question.
tok = getattr(processor, "tokenizer", None)
question_ids = None
if tok is not None and question:
question_ids = tok(
question,
return_tensors="pt",
add_special_tokens=False,
).input_ids.to(DEVICE)
with torch.no_grad():
generated_ids = model.generate(
vision_input=vision_input,
input_ids=inputs.input_ids,
question_input_ids=question_ids if question_ids is not None else inputs.input_ids,
max_new_tokens=int(max_new_tokens),
)
# Remove the prompt tokens from the generated sequence.
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
# Fallback: if trimming yields empty text, decode full sequences instead.
if not output_text or not output_text[0].strip():
output_text = [
processor.decode(ids, skip_special_tokens=True)
for ids in generated_ids
]
return output_text[0] if output_text else ""
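# Minimal smoke test outside the UI, assuming a hypothetical local clip and an
# available GPU/ZeroGPU allocation (the function is wrapped by @spaces.GPU):
#   print(qtsplus_generate("/tmp/clip.mp4", DEFAULT_QUESTION, max_new_tokens=64))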
with gr.Blocks() as demo:
gr.Markdown("# QTSplus-3B Video QA Demo")
with gr.Row():
video = gr.Video(label="Video")
with gr.Column():
question_box = gr.Textbox(
label="Question",
lines=3,
value=DEFAULT_QUESTION,
)
max_tokens = gr.Slider(
minimum=16,
maximum=512,
step=16,
value=256,
label="Max new tokens",
)
run_button = gr.Button("Run")
output_box = gr.Textbox(label="Model answer", lines=6)
run_button.click(
fn=qtsplus_generate,
inputs=[video, question_box, max_tokens],
outputs=output_box,
)
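    # The `inputs` components are passed positionally to qtsplus_generate:
    # video -> video_path, question_box -> question, max_tokens -> max_new_tokens.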
if __name__ == "__main__":
demo.queue().launch(ssr_mode=False)