Update model.py
model.py CHANGED
@@ -11,10 +11,10 @@ from transformers.cache_utils import DynamicCache, StaticCache
 
 # Step 3: Set device and default dtype
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-torch.set_default_dtype(torch.float16)
+torch.set_default_dtype(torch.float32 if DEVICE.type == "cpu" else torch.float16)
 
 # Step 4: Load CLIP model and processor
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.float16).to(DEVICE)
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.float32 if DEVICE.type == "cpu" else torch.float16).to(DEVICE)
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", use_fast=True)
 
 # Step 5: Define the MultiModalModel class
@@ -24,12 +24,12 @@ class MultiModalModel(nn.Module):
         self.phi = None  # Will be set after loading the PEFT model
         self.tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
         self.tokenizer.add_special_tokens({"additional_special_tokens": ["[IMG]"], "pad_token": "<pad>"})
-        self.clip = CLIPModel.from_pretrained(clip_model_name, torch_dtype=torch.float16).eval().to(DEVICE)
+        self.clip = CLIPModel.from_pretrained(clip_model_name, torch_dtype=torch.float32 if DEVICE.type == "cpu" else torch.float16).eval().to(DEVICE)
         image_embedding_dim = self.clip.config.projection_dim
         phi_hidden_size = 3072  # Hardcoded for Phi-3 mini
         self.image_projection = nn.Sequential(
-            nn.Linear(image_embedding_dim, phi_hidden_size, dtype=torch.float16),
-            nn.LayerNorm(phi_hidden_size, dtype=torch.float16),
+            nn.Linear(image_embedding_dim, phi_hidden_size, dtype=torch.float32 if DEVICE.type == "cpu" else torch.float16),
+            nn.LayerNorm(phi_hidden_size, dtype=torch.float32 if DEVICE.type == "cpu" else torch.float16),
             nn.Dropout(0.1)
         ).to(DEVICE)
         nn.init.xavier_uniform_(self.image_projection[0].weight, gain=1.0)
@@ -37,7 +37,7 @@ class MultiModalModel(nn.Module):
 
     def forward(self, text_input_ids, attention_mask=None, image_embedding=None):
         image_embedding = torch.clamp(image_embedding, min=-1e4, max=1e4)
-        image_embedding = F.normalize(image_embedding, dim=-1, eps=1e-5).to(torch.float16)
+        image_embedding = F.normalize(image_embedding, dim=-1, eps=1e-5).to(torch.float32 if DEVICE.type == "cpu" else torch.float16)
         with torch.no_grad():
             self.image_projection[0].weight.clamp_(-1.0, 1.0)
             self.image_projection[0].bias.clamp_(-1.0, 1.0)
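The same `torch.float32 if DEVICE.type == "cpu" else torch.float16` expression now appears in six places across the diff. A follow-up cleanup could hoist it into a single module-level constant; below is a minimal, self-contained sketch of that pattern, not part of this commit. The `DTYPE` name, the standalone copy of the projection head, and the smoke-test inputs are assumptions introduced here for illustration; 512 is CLIP ViT-B/32's projection_dim and 3072 is the Phi-3 mini hidden size hardcoded in the diff above.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Same device selection as model.py; fall back to float32 on CPU, where
# float16 kernels for ops like LayerNorm are limited or slow.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DTYPE = torch.float32 if DEVICE.type == "cpu" else torch.float16  # hypothetical constant, not in model.py

torch.set_default_dtype(DTYPE)

# Stand-in for the image projection head from the second hunk.
image_projection = nn.Sequential(
    nn.Linear(512, 3072, dtype=DTYPE),   # 512 = CLIP ViT-B/32 projection_dim
    nn.LayerNorm(3072, dtype=DTYPE),     # 3072 = Phi-3 mini hidden size
    nn.Dropout(0.1),
).to(DEVICE)

# Smoke test of the stability steps from the forward() hunk: clamp the raw
# embedding, L2-normalize it in the working dtype, and keep the projection
# weights bounded under no_grad.
image_embedding = torch.randn(1, 512, device=DEVICE, dtype=DTYPE)
image_embedding = torch.clamp(image_embedding, min=-1e4, max=1e4)
image_embedding = F.normalize(image_embedding, dim=-1, eps=1e-5).to(DTYPE)

with torch.no_grad():
    image_projection[0].weight.clamp_(-1.0, 1.0)
    image_projection[0].bias.clamp_(-1.0, 1.0)

projected = image_projection(image_embedding)
print(projected.shape, projected.dtype)  # torch.Size([1, 3072]); float32 on CPU, float16 on GPU

Centralizing the dtype choice this way keeps the CLIP model, the projection head, and the normalized embeddings in one consistent dtype, so a CPU-only runtime never touches a half-precision tensor.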