Upload 3 files
- README (2).md +25 -0
- app.py +202 -0
- gitattributes.txt +35 -0
README (2).md
ADDED
@@ -0,0 +1,25 @@
---
title: QTSplus 3B
emoji: 💻
colorFrom: green
colorTo: blue
sdk: gradio
sdk_version: 6.0.0
app_file: app.py
pinned: false
license: mit
---

## ✨ Cite our work

If you find this repo useful, please consider citing:

```bibtex
@misc{li2025seeingforesttreesqueryaware,
      title        = {Seeing the Forest and the Trees: Query-Aware Tokenizer for Long-Video Multimodal Language Models},
      author       = {Siyou Li and Huanan Wu and Juexi Shao and Yinghao Ma and Yujian Gan and Yihao Luo and Yuwei Wang and Dong Nie and Lu Wang and Wengqing Wu and Le Zhang and Massimo Poesio and Juntao Yu},
      year         = {2025},
      eprint       = {2511.11910},
      archivePrefix= {arXiv},
      primaryClass = {cs.CV},
      url          = {https://arxiv.org/abs/2511.11910}
}
```

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
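The frontmatter above declares `app.py` as the Gradio entry point. Note that `app.py` (below) installs its Python dependencies at startup via `os.system`; the more conventional Spaces setup would be a `requirements.txt` at the repo root. A minimal sketch of such a file, mirroring the versions `app.py` pins (hypothetical, not part of this upload):

```
torch==2.9.0
torchvision==0.24.0
torchaudio==2.9.0
transformers==4.57.1
av
qwen-vl-utils
sentencepiece
bitsandbytes
```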
app.py
ADDED
@@ -0,0 +1,202 @@
"""
Gradio app for running QTSplus on Hugging Face Spaces.

This follows the inference example in README.md and uses the
`AlpachinoNLP/QTSplus-3B` Hugging Face model.
"""

from __future__ import annotations

import os
import sys
from typing import Optional, List, Tuple

# Install runtime dependencies at startup (CPU-only torch wheels).
os.system('pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/cpu -U')
os.system('pip install transformers==4.57.1 av qwen-vl-utils sentencepiece bitsandbytes -U')

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# Ensure the project root (which contains `src/`) is on PYTHONPATH so any
# local utilities bundled with the Space can be imported as well.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from qwen_vl_utils import process_vision_info


DEFAULT_MODEL_ID = os.environ.get("QTSPLUS_MODEL_ID", "AlpachinoNLP/QTSplus-3B")
DEFAULT_QUESTION = "What is happening in the video?"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bfloat16 on GPU; full float32 on CPU, where half precision is poorly supported.
DTYPE = torch.bfloat16 if DEVICE.type == "cuda" else torch.float32

_MODEL: Optional[AutoModelForCausalLM] = None
_PROCESSOR: Optional[AutoProcessor] = None

def load_model_and_processor() -> Tuple[AutoModelForCausalLM, AutoProcessor]:
    """Lazy-load the QTSplus model and processor."""
    global _MODEL, _PROCESSOR
    if _MODEL is not None and _PROCESSOR is not None:
        return _MODEL, _PROCESSOR

    model_id = DEFAULT_MODEL_ID

    model_kwargs = {"trust_remote_code": True, "torch_dtype": DTYPE}
    if DEVICE.type == "cuda":
        # Let Transformers place layers automatically on the available GPU.
        model_kwargs["device_map"] = "auto"

    _MODEL = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    _MODEL.eval()
    if DEVICE.type != "cuda":
        _MODEL.to(DEVICE)

    _PROCESSOR = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    # Ensure generation uses explicit pad/eos ids.
    tok = getattr(_PROCESSOR, "tokenizer", None)
    if tok is not None:
        if getattr(_MODEL.config, "pad_token_id", None) is None:
            _MODEL.config.pad_token_id = tok.pad_token_id or tok.eos_token_id
        if getattr(_MODEL.config, "eos_token_id", None) is None:
            _MODEL.config.eos_token_id = tok.eos_token_id
        if hasattr(_MODEL, "generation_config") and _MODEL.generation_config is not None:
            _MODEL.generation_config.pad_token_id = _MODEL.config.pad_token_id
            _MODEL.generation_config.eos_token_id = _MODEL.config.eos_token_id

    return _MODEL, _PROCESSOR


# Preload model and processor at import time (for faster first inference).
# load_model_and_processor()


def build_messages(video: Optional[str], prompt: str) -> List[dict]:
    """Build chat-style messages for a single video + question."""
    if not video:
        raise ValueError("Please upload a video before running the model.")

    return [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video, "max_pixels": 360 * 420, "fps": 1.0},
                {"type": "text", "text": prompt or DEFAULT_QUESTION},
            ],
        }
    ]

def qtsplus_generate(video_path: Optional[str], question: str, max_new_tokens: int = 256) -> str:
    """Run QTSplus on the given video and question."""
    if not video_path:
        return "Please upload a video first."
    # Gradio may hand back a file:// URI; strip the scheme so decoding works.
    video_path = video_path.replace("file://", "")
    model, processor = load_model_and_processor()

    messages = build_messages(video_path, question)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **(video_kwargs or {}),
    )

    # Move inputs to the correct device and dtype.
    if DEVICE.type == "cuda":
        inputs = inputs.to(dtype=DTYPE, device=DEVICE)
    else:
        inputs = inputs.to(device=DEVICE)

    # Extract vision tensors for the QTSplus-specific `vision_input` argument.
    pixel_values_videos = inputs.pop("pixel_values_videos", None)
    video_grid_thw = inputs.pop("video_grid_thw", None)
    if "second_per_grid_ts" in inputs:
        inputs.pop("second_per_grid_ts")

    vision_input = None
    if pixel_values_videos is not None and video_grid_thw is not None:
        vision_input = {
            "pixel_values_videos": pixel_values_videos,
            "video_grid_thw": video_grid_thw,
        }

    # Build question_input_ids from the raw textual question.
    tok = getattr(processor, "tokenizer", None)
    question_ids = None
    if tok is not None and question:
        question_ids = tok(
            question,
            return_tensors="pt",
            add_special_tokens=False,
        ).input_ids.to(DEVICE)

    with torch.no_grad():
        generated_ids = model.generate(
            vision_input=vision_input,
            input_ids=inputs.input_ids,
            question_input_ids=question_ids if question_ids is not None else inputs.input_ids,
            max_new_tokens=int(max_new_tokens),
        )

    # Remove the prompt tokens from the generated sequence.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # Fallback: if trimming yields empty text, decode full sequences instead.
    if not output_text or not output_text[0].strip():
        output_text = [
            processor.decode(ids, skip_special_tokens=True)
            for ids in generated_ids
        ]

    return output_text[0] if output_text else ""

with gr.Blocks() as demo:
    gr.Markdown("# QTSplus-3B Video QA Demo")

    with gr.Row():
        video = gr.Video(label="Video")
        with gr.Column():
            question_box = gr.Textbox(
                label="Question",
                lines=3,
                value=DEFAULT_QUESTION,
            )
            max_tokens = gr.Slider(
                minimum=16,
                maximum=512,
                step=16,
                value=256,
                label="Max new tokens",
            )
            run_button = gr.Button("Run")

    output_box = gr.Textbox(label="Model answer", lines=6)

    run_button.click(
        fn=qtsplus_generate,
        inputs=[video, question_box, max_tokens],
        outputs=output_box,
    )


if __name__ == "__main__":
    demo.queue().launch()
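The Space runs the Gradio UI above, but `qtsplus_generate` can also be exercised without it, which is handy for a quick smoke test. A minimal sketch, assuming the file is saved as `app.py` and a short local clip named `sample.mp4` exists (both names are illustrative):

```python
# Hypothetical smoke test for qtsplus_generate; assumes app.py sits in the
# current directory and sample.mp4 is a short local video clip.
from app import qtsplus_generate

if __name__ == "__main__":
    answer = qtsplus_generate(
        "sample.mp4",
        "Describe the main activity in the clip.",
        max_new_tokens=64,
    )
    print(answer)
```

Importing `app` still triggers the startup `pip install` calls and builds the Blocks UI, but the demo only launches under `__main__`, so this test stays headless.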
gitattributes.txt
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
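The patterns above are the stock Hugging Face `.gitattributes` LFS defaults; none are specific to this Space. If sample clips were ever committed for the demo, common video extensions would need the same treatment, e.g. (hypothetical additions, not part of this upload):

```
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.webm filter=lfs diff=lfs merge=lfs -text
```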