Spaces:

seemggoel
/

Finetuning_Multimodal_LLM

Runtime error

App Files Files Community

Finetuning_Multimodal_LLM / app.py

seemggoel

Update app.py

c9aba3a verified 8 months ago

raw

history blame

11.1 kB

	# # Step 2: Import necessary libraries
	# import gradio as gr
	# from PIL import Image
	# from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM
	# from peft import PeftConfig, PeftModel
	# import torch
	# import torch.nn as nn
	# import torch.nn.functional as F
	# from transformers.cache_utils import DynamicCache
	# import json
	# import os
	# from peft import PeftConfig

	# # Step 3: Set device and default dtype
	# DEVICE = torch.device("cpu") # Explicitly set to CPU
	# torch.set_default_dtype(torch.float32) # Use float32 for CPU compatibility (float16 is less reliable on CPU)

	# # Step 4: Load CLIP model and processor
	# clip_model = CLIPModel.from_pretrained(
	# "openai/clip-vit-base-patch32",
	# torch_dtype=torch.float32 # Use float32 instead of float16
	# ).to(DEVICE)
	# clip_processor = CLIPProcessor.from_pretrained(
	# "openai/clip-vit-base-patch32",
	# use_fast=True
	# )

	# # Step 5: Define the MultiModalModel class
	# class MultiModalModel(nn.Module):
	# def __init__(self, phi_model_name="microsoft/phi-3-mini-4k-instruct", clip_model_name="openai/clip-vit-base-patch32"):
	# super().__init__()
	# self.phi = None # Will be set after loading the PEFT model
	# self.tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
	# self.tokenizer.add_special_tokens({"additional_special_tokens": ["[IMG]"], "pad_token": "<pad>"})
	# self.clip = CLIPModel.from_pretrained(
	# clip_model_name,
	# torch_dtype=torch.float32 # Use float32 for CPU
	# ).eval().to(DEVICE)
	# image_embedding_dim = self.clip.config.projection_dim
	# phi_hidden_size = 3072 # Hardcoded for Phi-3 mini
	# self.image_projection = nn.Sequential(
	# nn.Linear(image_embedding_dim, phi_hidden_size, dtype=torch.float32), # Use float32
	# nn.LayerNorm(phi_hidden_size, dtype=torch.float32),
	# nn.Dropout(0.1)
	# ).to(DEVICE)
	# nn.init.xavier_uniform_(self.image_projection[0].weight, gain=1.0)
	# nn.init.zeros_(self.image_projection[0].bias)

	# def forward(self, text_input_ids, attention_mask=None, image_embedding=None):
	# image_embedding = torch.clamp(image_embedding, min=-1e4, max=1e4)
	# image_embedding = F.normalize(image_embedding, dim=-1, eps=1e-5).to(torch.float32) # Use float32
	# with torch.no_grad():
	# self.image_projection[0].weight.clamp_(-1.0, 1.0)
	# self.image_projection[0].bias.clamp_(-1.0, 1.0)
	# projected_image = 1.0 * self.image_projection(image_embedding)
	# projected_image = torch.clamp(projected_image, min=-1e4, max=1e4)
	# if torch.isnan(projected_image).any() or torch.isinf(projected_image).any():
	# print("Warning: Projected image contains NaN or Inf values after clamping, replacing with zeros")
	# projected_image = torch.where(
	# torch.logical_or(torch.isnan(projected_image), torch.isinf(projected_image)),
	# torch.zeros_like(projected_image),
	# projected_image
	# )
	# if projected_image.dim() == 2:
	# projected_image = projected_image.unsqueeze(1)
	# text_embeddings = self.phi.get_input_embeddings()(text_input_ids)
	# fused_embeddings = text_embeddings.clone()
	# img_token_id = self.tokenizer.convert_tokens_to_ids("[IMG]")
	# img_token_mask = (text_input_ids == img_token_id)
	# for i in range(fused_embeddings.shape[0]):
	# img_positions = img_token_mask[i].nonzero(as_tuple=True)[0]
	# if img_positions.numel() > 0:
	# fused_embeddings[i, img_positions[0], :] = projected_image[i, 0, :]
	# if torch.isnan(fused_embeddings).any() or torch.isinf(fused_embeddings).any():
	# print("Warning: Fused embeddings contain NaN or Inf values, replacing with zeros")
	# fused_embeddings = torch.where(
	# torch.logical_or(torch.isnan(fused_embeddings), torch.isinf(fused_embeddings)),
	# torch.zeros_like(fused_embeddings),
	# fused_embeddings
	# )
	# return fused_embeddings

	# # Step 6: Load the fine-tuned model weights from Epoch_0
	# def load_model():
	# # 1. Load PEFT Config
	# peft_model_id = "finalmodel_v2" # Path to the saved PEFT directory
	# # Load the config.json file
	# config_path = os.path.join(peft_model_id, "config.json")
	# with open(config_path, "r") as f:
	# peft_config_dict = json.load(f)

	# # Check if 'eva_config' exists and remove it if it's not needed
	# if "eva_config" in peft_config_dict:
	# print("Found 'eva_config' in the config. Removing it...")
	# del peft_config_dict["eva_config"]

	# # Save the modified config back
	# with open(config_path, "w") as f:
	# json.dump(peft_config_dict, f, indent=2)

	# # Now load the config with PeftConfig
	# config = PeftConfig.from_pretrained(peft_model_id) # Use the config to determine the base model

	# attn_implementation = "eager"
	# cache = DynamicCache()

	# # Load base model without quantization (CPU-compatible)
	# base_model = AutoModelForCausalLM.from_pretrained(
	# config.base_model_name_or_path,
	# return_dict=True,
	# device_map="cpu", # Explicitly set to CPU
	# trust_remote_code=False,
	# torch_dtype=torch.float32, # Use float32 for CPU
	# attn_implementation="eager"
	# )
	# base_model.gradient_checkpointing_enable()
	# peft_model = PeftModel.from_pretrained(base_model, peft_model_id)
	# tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
	# special_tokens = {"additional_special_tokens": ["[IMG]"], "pad_token": "<pad>"}
	# tokenizer.add_special_tokens(special_tokens)
	# peft_model.resize_token_embeddings(len(tokenizer))
	# tokenizer.pad_token = tokenizer.eos_token

	# model = MultiModalModel(phi_model_name=config.base_model_name_or_path)
	# model.phi = peft_model
	# model.to(DEVICE)
	# model.eval()
	# return model, tokenizer

	# # Load the model
	# model, tokenizer = load_model()

	# # Step 7: Simple captioning function (for demonstration, assuming you have one)
	# def generate_caption(image, model, tokenizer):
	# try:
	# if not isinstance(image, Image.Image):
	# return "Error: Input must be a valid image."
	# if image.mode != "RGB":
	# image = image.convert("RGB")

	# # Process image with CLIP
	# image_inputs = clip_processor(images=image, return_tensors="pt").to(DEVICE)
	# with torch.no_grad():
	# image_embedding = clip_model.get_image_features(**image_inputs).to(torch.float32)

	# # Prepare prompt
	# prompt = "Caption this image: [IMG]"
	# inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
	# input_ids = inputs["input_ids"].to(DEVICE)
	# attention_mask = inputs["attention_mask"].to(DEVICE)

	# # Generate fused embeddings
	# with torch.no_grad():
	# fused_embedding = model(input_ids, attention_mask, image_embedding)

	# # Generate caption
	# with torch.no_grad():
	# generated_ids = model.phi.generate(
	# inputs_embeds=fused_embedding,
	# attention_mask=attention_mask,
	# max_new_tokens=50,
	# min_length=10,
	# num_beams=3, # Reduced for CPU speed
	# repetition_penalty=1.2,
	# do_sample=False
	# )
	# caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
	# return caption.strip()

	# except Exception as e:
	# return f"Error generating caption: {str(e)}"

	# # Step 8: Gradio interface (same as your previous request)
	# with gr.Blocks(title="CPU-Based Image Captioning") as interface:
	# with gr.Row():
	# with gr.Column():
	# image_input = gr.Image(type="pil", label="Upload an Image", sources=["upload"])
	# with gr.Column():
	# gr.Markdown("The image depicts a bustling cityscape at dusk, with towering skyscrapers reflecting the orange and pink hues of the setting sun. Streetlights are lined with a variety of vehicles, including cars, buses, and bicycles. Pedestrians can be seen walking along the sidewalks, some carrying shopping bags, while others are engrossed in their smartphones. The urban environment, casting a warm glow on the scene.")
	# caption_output = gr.Textbox(label="Caption:", placeholder="A vibrant cityscape at dusk, with skyscrapers reflecting the sunset", lines=2)

	# with gr.Row():
	# clear_button = gr.Button("Clear")
	# submit_button = gr.Button("Submit", variant="primary")

	# def update_caption(image):
	# if image is None:
	# return "Please upload an image."
	# caption = generate_caption(image, model, tokenizer)
	# return caption

	# submit_button.click(
	# fn=update_caption,
	# inputs=image_input,
	# outputs=caption_output
	# )

	# clear_button.click(
	# fn=lambda: "",
	# outputs=caption_output
	# )

	# interface.launch(debug=True)
	#!/usr/bin/env python

	# #!/usr/bin/env python

	# import os
	# import torch
	# from multimodal_app import create_interface

	# # Optional: Set torch threads to limit CPU usage
	# torch.set_num_threads(4)

	# # Create and launch the interface
	# demo = create_interface()
	# demo.queue() # Enable queuing for better handling of multiple requests
	# demo.launch()
	#!/usr/bin/env python

	import os
	import torch
	import sys
	import argparse

	# Set environment variables for better compatibility
	os.environ["TOKENIZERS_PARALLELISM"] = "false"

	# Set torch threads to limit CPU usage
	torch.set_num_threads(4)

	# Parse command line arguments
	parser = argparse.ArgumentParser(description="Multimodal Image Description App")
	parser.add_argument("--peft-model", type=str, default="model_V1",
	help="Path to PEFT model")
	parser.add_argument("--port", type=int, default=7860,
	help="Port to run the server on")
	args = parser.parse_args()

	try:
	from multimodal_app import create_interface, load_model

	# Preload the model with PEFT path
	print(f"Preloading model with PEFT path: {args.peft_model}")
	load_model(args.peft_model)

	# Create and launch the interface
	demo = create_interface()

	# Launch with proper settings for stability
	demo.launch(
	share=False, # Set to True if you want a public link
	debug=True, # Enable debug for better error messages
	server_name="0.0.0.0", # Listen on all interfaces
	server_port=args.port, # Port from arguments
	show_api=False # Hide API docs for simplicity
	)
	except Exception as e:
	print(f"Error starting application: {e}")
	import traceback
	traceback.print_exc()
	sys.exit(1)