Update model.py
model.py CHANGED
@@ -11,10 +11,10 @@ from transformers.cache_utils import DynamicCache, StaticCache
 
 # Step 3: Set device and default dtype
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-torch.set_default_dtype(torch.float16)
+torch.set_default_dtype(torch.float32 if DEVICE.type == "cpu" else torch.float16)
 
 # Step 4: Load CLIP model and processor
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.float16).to(DEVICE)
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.float32 if DEVICE.type == "cpu" else torch.float16).to(DEVICE)
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", use_fast=True)
 
 # Step 5: Define the MultiModalModel class
@@ -24,12 +24,12 @@ class MultiModalModel(nn.Module):
         self.phi = None  # Will be set after loading the PEFT model
         self.tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
         self.tokenizer.add_special_tokens({"additional_special_tokens": ["[IMG]"], "pad_token": "<pad>"})
-        self.clip = CLIPModel.from_pretrained(clip_model_name, torch_dtype=torch.float16).eval().to(DEVICE)
+        self.clip = CLIPModel.from_pretrained(clip_model_name, torch_dtype=torch.float32 if DEVICE.type == "cpu" else torch.float16).eval().to(DEVICE)
         image_embedding_dim = self.clip.config.projection_dim
         phi_hidden_size = 3072  # Hardcoded for Phi-3 mini
         self.image_projection = nn.Sequential(
-            nn.Linear(image_embedding_dim, phi_hidden_size, dtype=torch.float16),
-            nn.LayerNorm(phi_hidden_size, dtype=torch.float16),
+            nn.Linear(image_embedding_dim, phi_hidden_size, dtype=torch.float32 if DEVICE.type == "cpu" else torch.float16),
+            nn.LayerNorm(phi_hidden_size, dtype=torch.float32 if DEVICE.type == "cpu" else torch.float16),
             nn.Dropout(0.1)
         ).to(DEVICE)
         nn.init.xavier_uniform_(self.image_projection[0].weight, gain=1.0)
@@ -37,7 +37,7 @@ class MultiModalModel(nn.Module):
 
     def forward(self, text_input_ids, attention_mask=None, image_embedding=None):
         image_embedding = torch.clamp(image_embedding, min=-1e4, max=1e4)
-        image_embedding = F.normalize(image_embedding, dim=-1, eps=1e-5).to(torch.float16)
+        image_embedding = F.normalize(image_embedding, dim=-1, eps=1e-5).to(torch.float32 if DEVICE.type == "cpu" else torch.float16)
         with torch.no_grad():
             self.image_projection[0].weight.clamp_(-1.0, 1.0)
             self.image_projection[0].bias.clamp_(-1.0, 1.0)
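The same `torch.float32 if DEVICE.type == "cpu" else torch.float16` expression now appears in six places across the diff. A follow-up cleanup could hoist it into a single module-level constant; below is a minimal, self-contained sketch of that pattern, not part of this commit. The `DTYPE` name, the standalone copy of the projection head, and the smoke-test inputs are assumptions introduced here for illustration; 512 is CLIP ViT-B/32's projection_dim and 3072 is the Phi-3 mini hidden size hardcoded in the diff above.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Same device selection as model.py; fall back to float32 on CPU, where
# float16 kernels for ops like LayerNorm are limited or slow.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DTYPE = torch.float32 if DEVICE.type == "cpu" else torch.float16  # hypothetical constant, not in model.py

torch.set_default_dtype(DTYPE)

# Stand-in for the image projection head from the second hunk.
image_projection = nn.Sequential(
    nn.Linear(512, 3072, dtype=DTYPE),   # 512 = CLIP ViT-B/32 projection_dim
    nn.LayerNorm(3072, dtype=DTYPE),     # 3072 = Phi-3 mini hidden size
    nn.Dropout(0.1),
).to(DEVICE)

# Smoke test of the stability steps from the forward() hunk: clamp the raw
# embedding, L2-normalize it in the working dtype, and keep the projection
# weights bounded under no_grad.
image_embedding = torch.randn(1, 512, device=DEVICE, dtype=DTYPE)
image_embedding = torch.clamp(image_embedding, min=-1e4, max=1e4)
image_embedding = F.normalize(image_embedding, dim=-1, eps=1e-5).to(DTYPE)

with torch.no_grad():
    image_projection[0].weight.clamp_(-1.0, 1.0)
    image_projection[0].bias.clamp_(-1.0, 1.0)

projected = image_projection(image_embedding)
print(projected.shape, projected.dtype)  # torch.Size([1, 3072]); float32 on CPU, float16 on GPU

Centralizing the dtype choice this way keeps the CLIP model, the projection head, and the normalized embeddings in one consistent dtype, so a CPU-only runtime never touches a half-precision tensor.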