"""
Gradio app for running QTSplus on Hugging Face Spaces.

This follows the inference example in README.md and uses the
`AlpachinoNLP/QTSplus-3B` Hugging Face model.
"""

from __future__ import annotations
import spaces
import os
import sys
from typing import Optional, List, Tuple
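# Install pinned runtime dependencies at startup, before torch/transformers are
# imported; the `>=` specifier is quoted so the shell does not treat it as a redirect.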
os.system('pip3 install transformers==4.57.1 "huggingface_hub>=1.01" av qwen-vl-utils accelerate sentencepiece bitsandbytes -U')
os.system('pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu128 -U')
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# Ensure the project root (which contains `src/`) is on PYTHONPATH so local
# modules can be imported; video frame extraction itself uses the external
# `qwen_vl_utils` package installed above.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from qwen_vl_utils import process_vision_info


DEFAULT_MODEL_ID = os.environ.get("QTSPLUS_MODEL_ID", "AlpachinoNLP/QTSplus-3B")
DEFAULT_QUESTION = "What is happening in the video?"

USE_CUDA = True  # set to False if you ever run this Space on CPU-only hardware
DEVICE = "cuda" if USE_CUDA else "cpu"
DTYPE = torch.bfloat16  # half precision for GPU inference; prefer torch.float32 on CPU

_MODEL: Optional[AutoModelForCausalLM] = None
_PROCESSOR: Optional[AutoProcessor] = None


def load_model_and_processor() -> Tuple[AutoModelForCausalLM, AutoProcessor]:
    """Lazy-load the QTSplus model and processor."""
    global _MODEL, _PROCESSOR
    if _MODEL is not None and _PROCESSOR is not None:
        return _MODEL, _PROCESSOR

    model_id = DEFAULT_MODEL_ID

    model_kwargs = {"trust_remote_code": True, "torch_dtype": DTYPE}
    # if DEVICE == "cuda":
    #     # Let Transformers place layers automatically on the available GPU.
    #     model_kwargs["device_map"] = "auto"

    _MODEL = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    _MODEL.eval()
    _MODEL.to(DEVICE)

    _PROCESSOR = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    # Ensure generation uses explicit pad/eos ids.
    tok = getattr(_PROCESSOR, "tokenizer", None)
    if tok is not None:
        if getattr(_MODEL.config, "pad_token_id", None) is None:
            _MODEL.config.pad_token_id = tok.pad_token_id or tok.eos_token_id
        if getattr(_MODEL.config, "eos_token_id", None) is None:
            _MODEL.config.eos_token_id = tok.eos_token_id
        if hasattr(_MODEL, "generation_config") and _MODEL.generation_config is not None:
            _MODEL.generation_config.pad_token_id = _MODEL.config.pad_token_id
            _MODEL.generation_config.eos_token_id = _MODEL.config.eos_token_id

    return _MODEL, _PROCESSOR


# Preload model and processor at import time (for faster first inference).
load_model_and_processor()


def build_messages(video: Optional[str], prompt: str) -> List[dict]:
    """Build chat-style messages for a single video + question."""
    if not video:
        raise ValueError("Please upload a video before running the model.")

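    # Single-turn chat message: `max_pixels` caps the per-frame resolution and
    # `fps=1.0` samples roughly one frame per second of video.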
    return [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video, "max_pixels": 360 * 420, "fps": 1.0},
                {"type": "text", "text": prompt or DEFAULT_QUESTION},
            ],
        }
    ]

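# On ZeroGPU Spaces, `spaces.GPU` requests a GPU for each call; `duration=120`
# asks for up to 120 seconds of GPU time per request.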
@spaces.GPU(duration=120)
def qtsplus_generate(video_path: Optional[str], question: str, max_new_tokens: int = 256) -> str:
    """Run QTSplus on the given video and question."""
    if not video_path:
        return "Please upload a video first."
    video_path = video_path.replace("file://", "")  # strip any file:// URI prefix from the Gradio upload
    model, processor = load_model_and_processor()

    messages = build_messages(video_path, question)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

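    # `process_vision_info` loads the video referenced in the messages and returns
    # frame tensors plus extra processor kwargs (e.g. fps metadata) for the call below.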
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **(video_kwargs or {}),
    )

    # Move inputs to the correct device and dtype.
    inputs = inputs.to(dtype=DTYPE, device=DEVICE)
    

    # Extract vision tensors for QTSplus-specific `vision_input` argument.
    pixel_values_videos = inputs.pop("pixel_values_videos", None)
    video_grid_thw = inputs.pop("video_grid_thw", None)
    if "second_per_grid_ts" in inputs:
        inputs.pop("second_per_grid_ts")

    vision_input = None
    if pixel_values_videos is not None and video_grid_thw is not None:
        vision_input = {
            "pixel_values_videos": pixel_values_videos,
            "video_grid_thw": video_grid_thw,
        }

    # Build question_input_ids from the raw textual question.
    tok = getattr(processor, "tokenizer", None)
    question_ids = None
    if tok is not None and question:
        question_ids = tok(
            question,
            return_tensors="pt",
            add_special_tokens=False,
        ).input_ids.to(DEVICE)

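    # QTSplus's custom `generate` takes the vision tensors via `vision_input` and
    # the raw question ids via `question_input_ids`; when no question ids were
    # built above, fall back to the full prompt ids.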
    with torch.no_grad():
        generated_ids = model.generate(
            vision_input=vision_input,
            input_ids=inputs.input_ids,
            question_input_ids=question_ids if question_ids is not None else inputs.input_ids,
            max_new_tokens=int(max_new_tokens),
        )

    # Remove the prompt tokens from the generated sequence.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # Fallback: if trimming yields empty text, decode full sequences instead.
    if not output_text or not output_text[0].strip():
        output_text = [
            processor.decode(ids, skip_special_tokens=True)
            for ids in generated_ids
        ]

    return output_text[0] if output_text else ""


with gr.Blocks() as demo:
    gr.Markdown("# QTSplus-3B Video QA Demo")

    with gr.Row():
        video = gr.Video(label="Video")
        with gr.Column():
            question_box = gr.Textbox(
                label="Question",
                lines=3,
                value=DEFAULT_QUESTION,
            )
            max_tokens = gr.Slider(
                minimum=16,
                maximum=512,
                step=16,
                value=256,
                label="Max new tokens",
            )
            run_button = gr.Button("Run")

    output_box = gr.Textbox(label="Model answer", lines=6)

    run_button.click(
        fn=qtsplus_generate,
        inputs=[video, question_box, max_tokens],
        outputs=output_box,
    )


if __name__ == "__main__":
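    # `queue()` lets Gradio process requests sequentially instead of dropping them;
    # `ssr_mode=False` disables Gradio's server-side rendering.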
    demo.queue().launch(ssr_mode=False)