add snapshot download
app.py CHANGED
@@ -1,3 +1,4 @@
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -6,20 +7,20 @@ import json
 import gradio as gr
 from PIL import Image
 import numpy as np
+from huggingface_hub import snapshot_download
 from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-
-import Spaces
-
+import spaces
 
 # Download model files
 model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")
 
-
+# Load model parameters and tokenizer configuration
+with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
 
-with open('
+with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)
 
 class GELU(nn.Module):
@@ -28,6 +29,7 @@ class GELU(nn.Module):
         self.linear = nn.Linear(dim_in, dim_out, bias=bias)
         self.approximate = approximate
 
+    @spaces.GPU
     def forward(self, x):
         if self.approximate == 'tanh':
             return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -46,6 +48,7 @@ class Rope2D(nn.Module):
         self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
         self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
 
+    @spaces.GPU
     def forward(self, x, seq_len=None):
         if seq_len > self.max_seq_len_cached:
             self.max_seq_len_cached = seq_len
@@ -69,6 +72,7 @@ class VisionEncoder(nn.Module):
         self.norm = nn.LayerNorm(config['hidden_size'])
         self.gelu = GELU(config['hidden_size'], config['hidden_size'])
 
+    @spaces.GPU
     def forward(self, pixel_values):
         x = self.embed(pixel_values)
         b, c, h, w = x.shape
@@ -86,30 +90,34 @@ class PixtralModel(nn.Module):
         self.vision_encoder = VisionEncoder(params['vision_encoder'])
         # Add text generation components here
 
+    @spaces.GPU
     def forward(self, image):
         vision_output = self.vision_encoder(image)
         # Add text generation logic here
         return vision_output
 
-
-
-
-
-with safe_open('consolidated.safetensors', framework="pt", device="
-
-
-
-
-model.eval()
+@spaces.GPU
+def load_model(params, model_path):
+    model = PixtralModel(params)
+
+    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cuda") as f:
+        for name, param in model.named_parameters():
+            if name in f.keys():
+                param.data = f.get_tensor(name)
+
+    model.eval()
+    return model.cuda()
 
-
+model = load_model(params, model_path)
 tokenizer = MistralTokenizer.from_model("pixtral")
 
+@spaces.GPU
 def process_image_and_text(image, prompt):
     # Prepare the image
     image = image.convert('RGB')
     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+    image_tensor = image_tensor.cuda()
 
     # Tokenize the input
     tokenized = tokenizer.encode_chat_completion(
@@ -169,4 +177,5 @@ with gr.Blocks() as demo:
     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
 
-
+if __name__ == "__main__":
+    demo.launch()
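
For context, the pattern this commit adopts is the Hugging Face ZeroGPU setup: snapshot_download caches the model repository on the Space's disk, and @spaces.GPU marks code that needs a GPU attached. Below is a minimal sketch of that pattern, assuming the spaces and huggingface_hub packages that are available on Spaces; generate is a hypothetical stand-in for the Space's inference entry point, not a function from this repo. Note that ZeroGPU's decorator is normally applied to the top-level function the UI calls (here, process_image_and_text) rather than to every forward method.

import spaces
from huggingface_hub import snapshot_download

# Download the model repository once; repeat calls reuse the local cache.
model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")

@spaces.GPU  # hypothetical entry point; a GPU is attached only while this runs
def generate(prompt: str) -> str:
    # GPU-dependent work (.cuda() moves, model forward passes) goes here.
    ...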
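The last hunk of process_image_and_text is cut off at the tokenizer.encode_chat_completion( call. For reference, a hedged sketch of how mistral_common builds such a multimodal request, following its published Pixtral usage (field names may differ across versions); the placeholder image and prompt are illustrative only.

from PIL import Image
from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

tokenizer = MistralTokenizer.from_model("pixtral")
image = Image.new("RGB", (512, 512))  # placeholder image for illustration
prompt = "Describe this image."

# A single user message mixing an image chunk and a text chunk.
request = ChatCompletionRequest(
    messages=[UserMessage(content=[ImageChunk(image=image), TextChunk(text=prompt)])]
)
tokenized = tokenizer.encode_chat_completion(request)
# The result carries both the token ids and the preprocessed image(s).
print(len(tokenized.tokens), len(tokenized.images))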