Commit 9dfb729 · "update"
Parent(s): c32e4c7

Files changed:
- app.py (+40 −39)
- requirements.txt (+4 −4)
app.py  CHANGED

@@ -11,23 +11,10 @@ import torchvision.transforms as transforms
 from decord import VideoReader
 from PIL import Image, ImageDraw, ImageFont
 from transformers import AutoModel, AutoTokenizer
+import devicetorch
+
+#import spaces
 
-import spaces
-
-title_markdown = ("""
-<div style="display: flex; justify-content: flex-start; align-items: center; text-align: center;">
-  <div style="margin-right: 20px; display: flex; align-items: center;">
-    <a href="https://github.com/ShareGPT4Omni/ShareGPT4Video" style="text-decoration: none; display: flex; align-items: center;">
-      <img src="https://raw.githubusercontent.com/ShareGPT4V/ShareGPT4V-Resources/master/images/share4video_tight.png" alt="ShareGPT4Video🚀" style="max-width: 120px; height: auto;">
-    </a>
-  </div>
-  <div>
-    <h1>ShareGPT4Video: Improving Video Understanding and Generation with Better Captions</h1>
-    <h5 style="margin: 0;">If you like our project, please give us a star ✨ on Github for the latest update.</h5>
-    <h5 style="margin: 0;"> <a href="https://sharegpt4video.github.io/">[Project Page]</a> <a href="https://github.com/ShareGPT4Omni/ShareGPT4Video">[Code]</a> <a href="https://arxiv.org/abs/2406.04325v1">[Paper]</a>
-  </div>
-</div>
-""")
 
 block_css = """
 #buttons button {
@@ -35,17 +22,14 @@ block_css = """
 }
 """
 
-learn_more_markdown = ("""
-### License
-The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
-""")
-
-
+device = devicetorch.get(torch)
 new_path = 'Lin-Chen/ShareCaptioner-Video'
 tokenizer = AutoTokenizer.from_pretrained(new_path, trust_remote_code=True)
 model = AutoModel.from_pretrained(
-    new_path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
-
+    #new_path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
+    new_path, torch_dtype=torch.float16, trust_remote_code=True).to(device).eval()
+#model.cuda()
+model.to(device)
 model.tokenizer = tokenizer
 
 
@@ -120,7 +104,8 @@ def model_gen(model, text, images, need_bos=True, hd_num=25, max_new_token=2048,
             text_embeds = model.encode_text(
                 subtext, add_special_tokens=need_bos)
             embeds.append(text_embeds)
-            im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
+            #im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
+            im_mask.append(torch.zeros(text_embeds.shape[:2]).to(device))
             need_bos = False
         if i < len(images):
             try:
@@ -129,11 +114,13 @@
                 image = images[i].convert('RGB')
 
             image = HD_transform(image, hd_num=hd_num)
-            image = model.vis_processor(image).unsqueeze(0).cuda()
+            #image = model.vis_processor(image).unsqueeze(0).cuda()
+            image = model.vis_processor(image).unsqueeze(0).to(device)
             image_embeds = model.encode_img(image)
             print(image_embeds.shape)
             embeds.append(image_embeds)
-            im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
+            #im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
+            im_mask.append(torch.ones(image_embeds.shape[:2]).to(device))
             pt1 = pts
     embeds = torch.cat(embeds, dim=1)
     im_mask = torch.cat(im_mask, dim=1)
@@ -232,14 +219,17 @@ def encode_resized_image(image_path, max_size=1024):
     return base64.b64encode(buffer.getvalue()).decode('utf-8')
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_slidingcaptioning(video_path):
     imgs = load_quota_video(video_path)
     q = 'This is the first frame of a video, describe it in detail.'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     img = imgs[0]
-    with torch.cuda.amp.autocast():
-        response = model_gen(model, query, img, hd_num=9)
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, img, hd_num=9)
+    else:
+        response = model_gen(model, query, img, hd_num=9)
     print(response)
     responses = [response]
     images = [img]
@@ -253,7 +243,10 @@ def generate_slidingcaptioning(video_path):
         new_img.paste(image1, (0, 0))
         new_img.paste(image2, (0, height+50))
         query = f'[UNUSED_TOKEN_146]user\n{prompt}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
-        with torch.cuda.amp.autocast():
+        if device == "cuda":
+            with torch.cuda.amp.autocast():
+                response = model_gen(model, query, new_img, hd_num=9)
+        else:
            response = model_gen(model, query, new_img, hd_num=9)
         responses.append(response)
         images.append(new_img)
@@ -263,29 +256,39 @@ def generate_slidingcaptioning(video_path):
         idx+1, idx*2, txt)
     query = f'[UNUSED_TOKEN_146]user\n{prompt}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     print(query)
-    with torch.cuda.amp.autocast():
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            summ = model_gen(model, query, None, hd_num=16)
+    else:
        summ = model_gen(model, query, None, hd_num=16)
     print(summ)
     return summ
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_fastcaptioning(video_path):
     q = 'Here are a few key frames of a video, discribe this video in detail.'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     imgs = load_quota_video(video_path)
     img = img_process(imgs)
-    with torch.cuda.amp.autocast():
-        response = model_gen(model, query, img, hd_num=16,
-                             do_sample=False, beam=3)
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, img, hd_num=16,
+                                 do_sample=False, beam=3)
+    else:
+        response = model_gen(model, query, img, hd_num=16,
+                             do_sample=False, beam=3)
     return response
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_promptrecaptioning(text):
     q = f'Translate this brief generation prompt into a detailed caption: {text}'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
-    with torch.cuda.amp.autocast():
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, None)
+    else:
        response = model_gen(model, query, None)
     return response
 
@@ -298,7 +301,6 @@ def save_video_to_local(video_path):
 
 
 with gr.Blocks(title='ShareCaptioner-Video', theme=gr.themes.Default(), css=block_css) as demo:
-    gr.Markdown(title_markdown)
     state = gr.State()
     state_ = gr.State()
     first_run = gr.State()
@@ -333,7 +335,6 @@ with gr.Blocks(title='ShareCaptioner-Video', theme=gr.themes.Default(), css=bloc
        textbox_out = gr.Textbox(
            show_label=False, placeholder="Output", container=False
        )
-    gr.Markdown(learn_more_markdown)
 
    submit_btn_sc.click(generate_slidingcaptioning, [video], [textbox_out])
    submit_btn_fc.click(generate_fastcaptioning, [video], [textbox_out])
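Note on the pattern above: the commit resolves a device once with devicetorch.get(torch), moves the model and masks with .to(device), and wraps every model_gen call in an "if device == "cuda": with torch.cuda.amp.autocast(): ..." branch with a plain fallback otherwise. Below is a minimal sketch of how that repeated branch could be collapsed into one code path; pick_device and maybe_autocast are hypothetical helper names that are not in the commit, and pick_device only approximates what devicetorch.get(torch) returns.

# Sketch only (not part of the commit); assumes the same torch import as app.py.
import contextlib
import torch

def pick_device() -> str:
    # Rough equivalent of devicetorch.get(torch): prefer CUDA, then Apple MPS, else CPU.
    if torch.cuda.is_available():
        return "cuda"
    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

def maybe_autocast(device: str):
    # CUDA gets mixed-precision autocast; other devices get a no-op context manager.
    if device == "cuda":
        return torch.cuda.amp.autocast()
    return contextlib.nullcontext()

# Each generate_* function could then keep a single call site, e.g.:
#     with maybe_autocast(device):
#         response = model_gen(model, query, img, hd_num=9)

Used this way, the CUDA and non-CUDA paths share one model_gen call instead of duplicating the argument list in every branch.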
requirements.txt  CHANGED

@@ -1,5 +1,5 @@
-torch==2.1.2
-torchvision==0.16.2
+#torch==2.1.2
+#torchvision==0.16.2
 transformers==4.37.2
 tokenizers==0.15.1
 sentencepiece==0.1.99
@@ -13,12 +13,12 @@ scikit-learn==1.2.2
 gradio==4.16.0
 gradio_client==0.8.1
 openai
-spaces
+#spaces
 requests
 httpx==0.24.0
 uvicorn
 fastapi
-decord
+#decord
 einops==0.6.1
 einops-exts==0.0.4
 timm==0.6.13
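With torch, torchvision, spaces, and decord commented out, those packages are presumably expected to come from the launcher environment (for example a platform-specific torch build) rather than being pinned here, even though app.py still imports torch and decord directly. A hypothetical startup guard, not part of the commit, could surface a missing package with a clearer message:

# Hypothetical check (not in the commit): fail early if the packages dropped
# from requirements.txt are not provided by the surrounding environment.
import importlib.util

for pkg in ("torch", "torchvision", "decord"):
    if importlib.util.find_spec(pkg) is None:
        raise ImportError(
            f"{pkg} is no longer pinned in requirements.txt; install it separately.")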