add snapshot download
app.py CHANGED
@@ -1,3 +1,4 @@
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -6,20 +7,20 @@ import json
 import gradio as gr
 from PIL import Image
 import numpy as np
+from huggingface_hub import snapshot_download
 from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-
-import Spaces
-
+import spaces
 
 # Download model files
 model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")
 
-
+# Load model parameters and tokenizer configuration
+with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
 
-with open('
+with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)
 
 class GELU(nn.Module):
@@ -28,6 +29,7 @@ class GELU(nn.Module):
         self.linear = nn.Linear(dim_in, dim_out, bias=bias)
         self.approximate = approximate
 
+    @spaces.GPU
     def forward(self, x):
         if self.approximate == 'tanh':
             return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -46,6 +48,7 @@ class Rope2D(nn.Module):
         self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
         self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
 
+    @spaces.GPU
     def forward(self, x, seq_len=None):
         if seq_len > self.max_seq_len_cached:
             self.max_seq_len_cached = seq_len
@@ -69,6 +72,7 @@ class VisionEncoder(nn.Module):
         self.norm = nn.LayerNorm(config['hidden_size'])
         self.gelu = GELU(config['hidden_size'], config['hidden_size'])
 
+    @spaces.GPU
     def forward(self, pixel_values):
         x = self.embed(pixel_values)
         b, c, h, w = x.shape
@@ -86,30 +90,34 @@ class PixtralModel(nn.Module):
         self.vision_encoder = VisionEncoder(params['vision_encoder'])
         # Add text generation components here
 
+    @spaces.GPU
     def forward(self, image):
         vision_output = self.vision_encoder(image)
         # Add text generation logic here
         return vision_output
 
-
-
-
-
-with safe_open('consolidated.safetensors', framework="pt", device="
-
-
-
-
-model.eval()
+@spaces.GPU
+def load_model(params, model_path):
+    model = PixtralModel(params)
+
+    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cuda") as f:
+        for name, param in model.named_parameters():
+            if name in f.keys():
+                param.data = f.get_tensor(name)
+
+    model.eval()
+    return model.cuda()
 
-
+model = load_model(params, model_path)
 tokenizer = MistralTokenizer.from_model("pixtral")
 
+@spaces.GPU
 def process_image_and_text(image, prompt):
     # Prepare the image
     image = image.convert('RGB')
     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+    image_tensor = image_tensor.cuda()
 
     # Tokenize the input
     tokenized = tokenizer.encode_chat_completion(
@@ -169,4 +177,5 @@ with gr.Blocks() as demo:
     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
 
-
+if __name__ == "__main__":
+    demo.launch()
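
For context, the pattern this commit adopts is the Hugging Face ZeroGPU setup: snapshot_download caches the model repository on the Space's disk, and @spaces.GPU marks code that needs a GPU attached. Below is a minimal sketch of that pattern, assuming the spaces and huggingface_hub packages that are available on Spaces; generate is a hypothetical stand-in for the Space's inference entry point, not a function from this repo. Note that ZeroGPU's decorator is normally applied to the top-level function the UI calls (here, process_image_and_text) rather than to every forward method.

import spaces
from huggingface_hub import snapshot_download

# Download the model repository once; repeat calls reuse the local cache.
model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")

@spaces.GPU  # hypothetical entry point; a GPU is attached only while this runs
def generate(prompt: str) -> str:
    # GPU-dependent work (.cuda() moves, model forward passes) goes here.
    ...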
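The last hunk of process_image_and_text is cut off at the tokenizer.encode_chat_completion( call. For reference, a hedged sketch of how mistral_common builds such a multimodal request, following its published Pixtral usage (field names may differ across versions); the placeholder image and prompt are illustrative only.

from PIL import Image
from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

tokenizer = MistralTokenizer.from_model("pixtral")
image = Image.new("RGB", (512, 512))  # placeholder image for illustration
prompt = "Describe this image."

# A single user message mixing an image chunk and a text chunk.
request = ChatCompletionRequest(
    messages=[UserMessage(content=[ImageChunk(image=image), TextChunk(text=prompt)])]
)
tokenized = tokenizer.encode_chat_completion(request)
# The result carries both the token ids and the preprocessed image(s).
print(len(tokenized.tokens), len(tokenized.images))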