Commit 9dfb729 · "update"
Parent(s): c32e4c7

Files changed:
- app.py (+40 −39)
- requirements.txt (+4 −4)
app.py  CHANGED

@@ -11,23 +11,10 @@ import torchvision.transforms as transforms
 from decord import VideoReader
 from PIL import Image, ImageDraw, ImageFont
 from transformers import AutoModel, AutoTokenizer
+import devicetorch
+
+#import spaces
 
-import spaces
-
-title_markdown = ("""
-<div style="display: flex; justify-content: flex-start; align-items: center; text-align: center;">
-  <div style="margin-right: 20px; display: flex; align-items: center;">
-    <a href="https://github.com/ShareGPT4Omni/ShareGPT4Video" style="text-decoration: none; display: flex; align-items: center;">
-      <img src="https://raw.githubusercontent.com/ShareGPT4V/ShareGPT4V-Resources/master/images/share4video_tight.png" alt="ShareGPT4Video🚀" style="max-width: 120px; height: auto;">
-    </a>
-  </div>
-  <div>
-    <h1>ShareGPT4Video: Improving Video Understanding and Generation with Better Captions</h1>
-    <h5 style="margin: 0;">If you like our project, please give us a star ✨ on Github for the latest update.</h5>
-    <h5 style="margin: 0;"> <a href="https://sharegpt4video.github.io/">[Project Page]</a> <a href="https://github.com/ShareGPT4Omni/ShareGPT4Video">[Code]</a> <a href="https://arxiv.org/abs/2406.04325v1">[Paper]</a>
-  </div>
-</div>
-""")
 
 block_css = """
 #buttons button {
@@ -35,17 +22,14 @@ block_css = """
 }
 """
 
-learn_more_markdown = ("""
-### License
-The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
-""")
-
-
+device = devicetorch.get(torch)
 new_path = 'Lin-Chen/ShareCaptioner-Video'
 tokenizer = AutoTokenizer.from_pretrained(new_path, trust_remote_code=True)
 model = AutoModel.from_pretrained(
-    new_path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
-
+    #new_path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
+    new_path, torch_dtype=torch.float16, trust_remote_code=True).to(device).eval()
+#model.cuda()
+model.to(device)
 model.tokenizer = tokenizer
 
 
@@ -120,7 +104,8 @@ def model_gen(model, text, images, need_bos=True, hd_num=25, max_new_token=2048,
             text_embeds = model.encode_text(
                 subtext, add_special_tokens=need_bos)
             embeds.append(text_embeds)
-            im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
+            #im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
+            im_mask.append(torch.zeros(text_embeds.shape[:2]).to(device))
             need_bos = False
         if i < len(images):
             try:
@@ -129,11 +114,13 @@
                 image = images[i].convert('RGB')
 
             image = HD_transform(image, hd_num=hd_num)
-            image = model.vis_processor(image).unsqueeze(0).cuda()
+            #image = model.vis_processor(image).unsqueeze(0).cuda()
+            image = model.vis_processor(image).unsqueeze(0).to(device)
             image_embeds = model.encode_img(image)
             print(image_embeds.shape)
             embeds.append(image_embeds)
-            im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
+            #im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
+            im_mask.append(torch.ones(image_embeds.shape[:2]).to(device))
             pt1 = pts
     embeds = torch.cat(embeds, dim=1)
     im_mask = torch.cat(im_mask, dim=1)
@@ -232,14 +219,17 @@ def encode_resized_image(image_path, max_size=1024):
     return base64.b64encode(buffer.getvalue()).decode('utf-8')
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_slidingcaptioning(video_path):
     imgs = load_quota_video(video_path)
     q = 'This is the first frame of a video, describe it in detail.'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     img = imgs[0]
-    with torch.cuda.amp.autocast():
-        response = model_gen(model, query, img, hd_num=9)
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, img, hd_num=9)
+    else:
+        response = model_gen(model, query, img, hd_num=9)
     print(response)
     responses = [response]
     images = [img]
@@ -253,7 +243,10 @@ def generate_slidingcaptioning(video_path):
         new_img.paste(image1, (0, 0))
         new_img.paste(image2, (0, height+50))
         query = f'[UNUSED_TOKEN_146]user\n{prompt}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
-        with torch.cuda.amp.autocast():
+        if device == "cuda":
+            with torch.cuda.amp.autocast():
+                response = model_gen(model, query, new_img, hd_num=9)
+        else:
            response = model_gen(model, query, new_img, hd_num=9)
         responses.append(response)
         images.append(new_img)
@@ -263,29 +256,39 @@ def generate_slidingcaptioning(video_path):
         idx+1, idx*2, txt)
     query = f'[UNUSED_TOKEN_146]user\n{prompt}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     print(query)
-    with torch.cuda.amp.autocast():
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            summ = model_gen(model, query, None, hd_num=16)
+    else:
        summ = model_gen(model, query, None, hd_num=16)
     print(summ)
     return summ
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_fastcaptioning(video_path):
     q = 'Here are a few key frames of a video, discribe this video in detail.'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     imgs = load_quota_video(video_path)
     img = img_process(imgs)
-    with torch.cuda.amp.autocast():
-        response = model_gen(model, query, img, hd_num=16,
-                             do_sample=False, beam=3)
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, img, hd_num=16,
+                                 do_sample=False, beam=3)
+    else:
+        response = model_gen(model, query, img, hd_num=16,
+                             do_sample=False, beam=3)
     return response
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_promptrecaptioning(text):
     q = f'Translate this brief generation prompt into a detailed caption: {text}'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
-    with torch.cuda.amp.autocast():
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, None)
+    else:
        response = model_gen(model, query, None)
     return response
 
@@ -298,7 +301,6 @@ def save_video_to_local(video_path):
 
 
 with gr.Blocks(title='ShareCaptioner-Video', theme=gr.themes.Default(), css=block_css) as demo:
-    gr.Markdown(title_markdown)
     state = gr.State()
     state_ = gr.State()
     first_run = gr.State()
@@ -333,7 +335,6 @@ with gr.Blocks(title='ShareCaptioner-Video', theme=gr.themes.Default(), css=bloc
        textbox_out = gr.Textbox(
            show_label=False, placeholder="Output", container=False
        )
-    gr.Markdown(learn_more_markdown)
 
    submit_btn_sc.click(generate_slidingcaptioning, [video], [textbox_out])
    submit_btn_fc.click(generate_fastcaptioning, [video], [textbox_out])
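Note on the pattern above: the commit resolves a device once with devicetorch.get(torch), moves the model and masks with .to(device), and wraps every model_gen call in an "if device == "cuda": with torch.cuda.amp.autocast(): ..." branch with a plain fallback otherwise. Below is a minimal sketch of how that repeated branch could be collapsed into one code path; pick_device and maybe_autocast are hypothetical helper names that are not in the commit, and pick_device only approximates what devicetorch.get(torch) returns.

# Sketch only (not part of the commit); assumes the same torch import as app.py.
import contextlib
import torch

def pick_device() -> str:
    # Rough equivalent of devicetorch.get(torch): prefer CUDA, then Apple MPS, else CPU.
    if torch.cuda.is_available():
        return "cuda"
    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

def maybe_autocast(device: str):
    # CUDA gets mixed-precision autocast; other devices get a no-op context manager.
    if device == "cuda":
        return torch.cuda.amp.autocast()
    return contextlib.nullcontext()

# Each generate_* function could then keep a single call site, e.g.:
#     with maybe_autocast(device):
#         response = model_gen(model, query, img, hd_num=9)

Used this way, the CUDA and non-CUDA paths share one model_gen call instead of duplicating the argument list in every branch.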
requirements.txt  CHANGED

@@ -1,5 +1,5 @@
-torch==2.1.2
-torchvision==0.16.2
+#torch==2.1.2
+#torchvision==0.16.2
 transformers==4.37.2
 tokenizers==0.15.1
 sentencepiece==0.1.99
@@ -13,12 +13,12 @@ scikit-learn==1.2.2
 gradio==4.16.0
 gradio_client==0.8.1
 openai
-spaces
+#spaces
 requests
 httpx==0.24.0
 uvicorn
 fastapi
-decord
+#decord
 einops==0.6.1
 einops-exts==0.0.4
 timm==0.6.13
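With torch, torchvision, spaces, and decord commented out, those packages are presumably expected to come from the launcher environment (for example a platform-specific torch build) rather than being pinned here, even though app.py still imports torch and decord directly. A hypothetical startup guard, not part of the commit, could surface a missing package with a clearer message:

# Hypothetical check (not in the commit): fail early if the packages dropped
# from requirements.txt are not provided by the surrounding environment.
import importlib.util

for pkg in ("torch", "torchvision", "decord"):
    if importlib.util.find_spec(pkg) is None:
        raise ImportError(
            f"{pkg} is no longer pinned in requirements.txt; install it separately.")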