SiyouLi committed
Commit a09f733 · verified · Parent: fbdd7f9

Upload 3 files

Files changed (3)
  1. README (2).md +25 -0
  2. app.py +202 -0
  3. gitattributes.txt +35 -0
README (2).md ADDED
@@ -0,0 +1,25 @@
+ ---
+ title: QTSplus 3B
+ emoji: 💻
+ colorFrom: green
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 6.0.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+ ## ✨ Cite our work
+ If you find this repo useful, please consider citing:
+
+ ```bibtex
+ @misc{li2025seeingforesttreesqueryaware,
+     title = {Seeing the Forest and the Trees: Query-Aware Tokenizer for Long-Video Multimodal Language Models},
+     author = {Siyou Li and Huanan Wu and Juexi Shao and Yinghao Ma and Yujian Gan and Yihao Luo and Yuwei Wang and Dong Nie and Lu Wang and Wengqing Wu and Le Zhang and Massimo Poesio and Juntao Yu},
+     year = {2025},
+     eprint = {2511.11910},
+     archivePrefix = {arXiv},
+     primaryClass = {cs.CV},
+     url = {https://arxiv.org/abs/2511.11910}
+ }
+ ```
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,202 @@
+ """
+ Gradio app for running QTSplus on Hugging Face Spaces.
+
+ This follows the inference example in README.md and uses the
+ `AlpachinoNLP/QTSplus-3B` Hugging Face model.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import sys
+ from typing import Optional, List, Tuple
+ # Install runtime dependencies before importing them (this Space ships no requirements.txt).
+ os.system('pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/cpu -U')
+ os.system('pip install transformers==4.57.1 av qwen-vl-utils sentencepiece bitsandbytes -U')
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoProcessor
+
+ # Put the project root (which contains `src/`) on sys.path so that a local
+ # copy of the vision processing utilities takes precedence over the
+ # pip-installed `qwen_vl_utils` package when one is present.
+ PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+ if PROJECT_ROOT not in sys.path:
+     sys.path.insert(0, PROJECT_ROOT)
+
+ from qwen_vl_utils import process_vision_info
+
+
+ DEFAULT_MODEL_ID = os.environ.get("QTSPLUS_MODEL_ID", "AlpachinoNLP/QTSplus-3B")
+ DEFAULT_QUESTION = "What is happening in the video?"
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ DTYPE = torch.bfloat16 if DEVICE.type == "cuda" else torch.float16
+
+ _MODEL: Optional[AutoModelForCausalLM] = None
+ _PROCESSOR: Optional[AutoProcessor] = None
+
+
+ def load_model_and_processor() -> Tuple[AutoModelForCausalLM, AutoProcessor]:
+     """Lazy-load the QTSplus model and processor."""
+     global _MODEL, _PROCESSOR
+     if _MODEL is not None and _PROCESSOR is not None:
+         return _MODEL, _PROCESSOR
+
+     model_id = DEFAULT_MODEL_ID
+
+     model_kwargs = {"trust_remote_code": True, "torch_dtype": DTYPE}
+     if DEVICE.type == "cuda":
+         # Let Transformers place layers automatically on the available GPU.
+         model_kwargs["device_map"] = "auto"
+
+     _MODEL = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
+     _MODEL.eval()
+     if DEVICE.type != "cuda":
+         _MODEL.to(DEVICE)
+
+     _PROCESSOR = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+     # Ensure generation uses explicit pad/eos ids.
+     tok = getattr(_PROCESSOR, "tokenizer", None)
+     if tok is not None:
+         if getattr(_MODEL.config, "pad_token_id", None) is None:
+             _MODEL.config.pad_token_id = tok.pad_token_id or tok.eos_token_id
+         if getattr(_MODEL.config, "eos_token_id", None) is None:
+             _MODEL.config.eos_token_id = tok.eos_token_id
+         if hasattr(_MODEL, "generation_config") and _MODEL.generation_config is not None:
+             _MODEL.generation_config.pad_token_id = _MODEL.config.pad_token_id
+             _MODEL.generation_config.eos_token_id = _MODEL.config.eos_token_id
+
+     return _MODEL, _PROCESSOR
+
+
+ # # Preload model and processor at import time (for faster first inference).
+ # load_model_and_processor()
+
+
+ def build_messages(video: Optional[str], prompt: str) -> List[dict]:
+     """Build chat-style messages for a single video + question."""
+     if not video:
+         raise ValueError("Please upload a video before running the model.")
+
+     return [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "video", "video": video, "max_pixels": 360 * 420, "fps": 1.0},
+                 {"type": "text", "text": prompt or DEFAULT_QUESTION},
+             ],
+         }
+     ]
+
+
+ def qtsplus_generate(video_path: Optional[str], question: str, max_new_tokens: int = 256) -> str:
+     """Run QTSplus on the given video and question."""
+     if not video_path:
+         return "Please upload a video first."
+     video_path = video_path.replace("file://", "")
+     model, processor = load_model_and_processor()
+
+     messages = build_messages(video_path, question)
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+     image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
+
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+         **(video_kwargs or {}),
+     )
+
+     # Move inputs to the correct device and dtype.
+     if DEVICE.type == "cuda":
+         inputs = inputs.to(dtype=DTYPE, device=DEVICE)
+     else:
+         inputs = inputs.to(device=DEVICE)
+
+     # Extract vision tensors for QTSplus-specific `vision_input` argument.
+     pixel_values_videos = inputs.pop("pixel_values_videos", None)
+     video_grid_thw = inputs.pop("video_grid_thw", None)
+     if "second_per_grid_ts" in inputs:
+         inputs.pop("second_per_grid_ts")
+
+     vision_input = None
+     if pixel_values_videos is not None and video_grid_thw is not None:
+         vision_input = {
+             "pixel_values_videos": pixel_values_videos,
+             "video_grid_thw": video_grid_thw,
+         }
+
+     # Build question_input_ids from the raw textual question.
+     tok = getattr(processor, "tokenizer", None)
+     question_ids = None
+     if tok is not None and question:
+         question_ids = tok(
+             question,
+             return_tensors="pt",
+             add_special_tokens=False,
+         ).input_ids.to(DEVICE)
+
+     with torch.no_grad():
+         generated_ids = model.generate(
+             vision_input=vision_input,
+             input_ids=inputs.input_ids,
+             question_input_ids=question_ids if question_ids is not None else inputs.input_ids,
+             max_new_tokens=int(max_new_tokens),
+         )
+
+     # Remove the prompt tokens from the generated sequence.
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed,
+         skip_special_tokens=True,
+         clean_up_tokenization_spaces=True,
+     )
+
+     # Fallback: if trimming yields empty text, decode full sequences instead.
+     if not output_text or not output_text[0].strip():
+         output_text = [
+             processor.decode(ids, skip_special_tokens=True)
+             for ids in generated_ids
+         ]
+
+     return output_text[0] if output_text else ""
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# QTSplus-3B Video QA Demo")
+
+     with gr.Row():
+         video = gr.Video(label="Video")
+         with gr.Column():
+             question_box = gr.Textbox(
+                 label="Question",
+                 lines=3,
+                 value=DEFAULT_QUESTION,
+             )
+             max_tokens = gr.Slider(
+                 minimum=16,
+                 maximum=512,
+                 step=16,
+                 value=256,
+                 label="Max new tokens",
+             )
+             run_button = gr.Button("Run")
+
+     output_box = gr.Textbox(label="Model answer", lines=6)
+
+     run_button.click(
+         fn=qtsplus_generate,
+         inputs=[video, question_box, max_tokens],
+         outputs=output_box,
+     )
+
+
+ if __name__ == "__main__":
+     demo.queue().launch()
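
For reference, a minimal local smoke test of the `qtsplus_generate` helper defined above might look like the sketch below. The clip name `sample.mp4` and the token budget are hypothetical, and importing `app` also triggers its `os.system` dependency installs.

```python
# Minimal sketch: call the Space's inference helper directly.
# Assumes the dependencies pinned in app.py are installed and that a short
# local clip named sample.mp4 (hypothetical) sits next to this script.
from app import qtsplus_generate

answer = qtsplus_generate(
    video_path="sample.mp4",                      # hypothetical test clip
    question="What is happening in the video?",   # same default as the demo
    max_new_tokens=128,
)
print(answer)
```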
gitattributes.txt ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text