import time, torch, gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
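
# Besides torch / transformers / gradio, each video backend below needs its own
# package (an assumption about pip names: decord, av, opencv-python,
# torchvision); install whichever backend you plan to select in the UI.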

MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"

# Pick a safe float dtype for your GPU (Ampere+ -> bf16; else fp16; CPU -> fp32)
if torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    FLOAT_DTYPE = torch.bfloat16 if major >= 8 else torch.float16
else:
    FLOAT_DTYPE = torch.float32

# Load once (faster subsequent runs)
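# device_map="auto" needs the `accelerate` package and will place the weights
# on the available GPU, falling back to CPU otherwise.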
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=FLOAT_DTYPE, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

def run_video(video_path, prompt, max_new_tokens=256, backend="decord", num_frames=32):
    """video_path is a local file path; backend in {'decord','pyav','opencv','torchvision'}"""
    messages = [{
        "role": "user",
        "content": [
            {"type": "video", "path": video_path},
            {"type": "text",  "text": prompt},
        ],
    }]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt",
        video_load_backend=backend, num_frames=num_frames
    )
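    # The call above loads the video with the chosen backend, samples
    # `num_frames` frames, and returns model-ready tensors
    # (input_ids, attention_mask, pixel_values, ...).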

    # Move tensors to the model device; keep integer token IDs as int64 and
    # cast only floating-point tensors (e.g. pixel_values) to FLOAT_DTYPE
    for k, v in list(inputs.items()):
        if isinstance(v, torch.Tensor):
            v = v.to(model.device)
            if torch.is_floating_point(v):
                v = v.to(dtype=FLOAT_DTYPE)
            inputs[k] = v

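    # Greedy decoding (do_sample=False) keeps the output deterministic; eos/pad
    # token IDs come from the generation config, falling back to the tokenizer.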
    gen_kwargs = {
        "do_sample": False,
        "max_new_tokens": max_new_tokens,
        "eos_token_id": getattr(model.generation_config, "eos_token_id", None)
                        or getattr(processor.tokenizer, "eos_token_id", None),
        "pad_token_id": getattr(model.generation_config, "pad_token_id", None)
                        or getattr(processor.tokenizer, "pad_token_id", None),
    }

    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    t0 = time.perf_counter()
    out_ids = model.generate(**inputs, **gen_kwargs)
    latency = time.perf_counter() - t0

    # Decode only the newly generated tokens so the echoed prompt is dropped
    gen_ids = out_ids[:, inputs["input_ids"].shape[-1]:]
    text = processor.batch_decode(gen_ids, skip_special_tokens=True)[0]
    vram_gb = (torch.cuda.max_memory_allocated() / 1e9) if torch.cuda.is_available() else 0.0
    tokens_generated = int(gen_ids.shape[-1])

    # minimal pretty string
    pretty = (f"Latency: {latency:.3f}s | VRAM: {vram_gb:.2f} GB | Tokens: {tokens_generated}\n"
              f"{'-'*40}\n{text.strip()}")
    return pretty
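
# Standalone usage outside Gradio (a sketch; "sample.mp4" is a hypothetical
# local clip, not shipped with this Space):
#   print(run_video("sample.mp4", "Describe this video", max_new_tokens=128))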

def infer(video, prompt, tokens, frames, backend):
    # gr.Video returns a path, a dict, or a file object depending on the
    # Gradio version; normalize all three to a path string:
    if isinstance(video, dict):
        path = video.get("path") or video.get("name")
    else:
        path = video if isinstance(video, str) else getattr(video, "name", None)
    if not path:
        return "No video file received."
    # Sliders return floats; generation kwargs expect ints
    return run_video(path, prompt, max_new_tokens=int(tokens), backend=backend, num_frames=int(frames))

with gr.Blocks() as demo:
    gr.Markdown("## SmolVLM2-256M Video Test\nUpload an MP4 and enter your prompt. "
                "This Space mirrors your Colab test.")
    with gr.Row():
        vid = gr.Video(label="Upload MP4", sources=["upload"], include_audio=False)
        with gr.Column():
            prompt = gr.Textbox(label="Prompt", value="Describe this video to me", lines=2)
            tokens = gr.Slider(32, 512, value=256, step=16, label="max_new_tokens")
            frames = gr.Slider(8, 64, value=32, step=8, label="num_frames (sampling)")
            backend = gr.Dropdown(choices=["decord","pyav","opencv","torchvision"],
                                  value="decord", label="video_load_backend")
            btn = gr.Button("Run")
    out = gr.Textbox(label="Output", lines=15)
    btn.click(fn=infer, inputs=[vid, prompt, tokens, frames, backend], outputs=out)

if __name__ == "__main__":
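    # Served on http://127.0.0.1:7860 by default when run locally.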
    demo.launch()