# agent upd (commit 86793d6)
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from custom_wrapper import OpenRouterChat
from pydantic import BaseModel, Field
from typing import List
import os
import json
import cv2
import base64
from PIL import Image
import io
# Load variables from a local .env file so the API key need not be hard-coded.
load_dotenv()
# May be None if the variable is unset; OpenRouterChat receives it as-is.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
class AudioSuggestionOutput(BaseModel):
    """Structured LLM response: suggested footstep-audio names plus rationale.

    NOTE: the Field descriptions are embedded into the prompt via
    PydanticOutputParser.get_format_instructions(), so they are part of the
    runtime prompt text, not mere documentation.
    """

    # 3-5 YouTube-search-friendly audio names (see prompt rules 5-6).
    audio_suggestions: List[str] = Field(default_factory=list, description="Suggested audio names for footsteps")
    # Model's description of the scene/ground surface it analyzed.
    environment_description: str = Field(description="Description of the environment and ground surface")
    # Why these names fit the environment.
    reasoning: str = Field(description="Explanation for the audio suggestions")
# Vision-capable chat model reached through OpenRouter via the project's
# custom wrapper (see custom_wrapper.OpenRouterChat).
llm = OpenRouterChat(
    api_key=OPENROUTER_API_KEY,
    model="meta-llama/llama-3.2-90b-vision-instruct",
    temperature=0.7,
    max_tokens=1024
)
# Parses the model's raw text output into an AudioSuggestionOutput instance;
# also supplies the {format_instructions} text used in the prompt below.
parser = PydanticOutputParser(pydantic_object=AudioSuggestionOutput)
def extract_first_frame(video_path):
    """Return the first frame of *video_path* as a BGR ndarray, or None on failure.

    Failures (unopenable file, unreadable first frame) are reported to stdout
    rather than propagated, matching the script's print-and-continue style.
    """
    try:
        capture = cv2.VideoCapture(video_path)
        if not capture.isOpened():
            raise ValueError(f"Cannot open video file: {video_path}")
        ok, frame = capture.read()
        capture.release()
        if ok:
            return frame
        raise ValueError("Cannot read the first frame from video")
    except Exception as err:
        print(f"Error extracting first frame: {err}")
        return None
def image_to_base64(image):
    """Encode an OpenCV BGR image as a base64 JPEG string; None on failure."""
    try:
        # OpenCV stores frames as BGR; PIL expects RGB channel order.
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        buffer = io.BytesIO()
        Image.fromarray(rgb).save(buffer, format="JPEG", quality=85)
        return base64.b64encode(buffer.getvalue()).decode()
    except Exception as err:
        print(f"Error converting image to base64: {err}")
        return None
# Prompt template with two variables: {image_data} (base64 JPEG of the first
# video frame) and {format_instructions} (JSON-schema text from the parser).
# The template text is runtime behavior — do not edit casually.
prompt = ChatPromptTemplate.from_template("""
You are an expert sound designer and environmental analyst.
Analyze the provided image and suggest appropriate audio names for footsteps based on the environment, ground surface, and surroundings.
Image Data: {image_data}
Please analyze:
1. The type of ground/surface (concrete, grass, wood, carpet, gravel, etc.)
2. The environment (indoor, outdoor, urban, natural, etc.)
3. Weather conditions if visible (wet, dry, snowy, etc.)
4. Any other relevant factors that would affect footstep sounds
5. Audio suggestion's name must be friendly for a youtube search
6. Name without extensions
Provide 3-5 specific, descriptive audio file name suggestions for footsteps in this environment.
The names should be clear, concise, and follow standard audio naming conventions.
{format_instructions}
""")
# LCEL pipeline: the raw chain input (a base64 image string) is passed
# through as {image_data}, the parser's schema text is injected as
# {format_instructions}, then prompt -> model -> structured parse.
chain = (
    {"image_data": RunnablePassthrough(), "format_instructions": lambda x: parser.get_format_instructions()}
    | prompt
    | llm
    | parser
)
def analyze_image_and_suggest_audio(image_base64):
    """Run the LLM chain on a base64 image and return the parsed result as a dict.

    Parameters:
        image_base64: base64-encoded JPEG of the frame to analyze.

    Returns:
        dict with keys audio_suggestions / environment_description / reasoning,
        or None when invocation or parsing fails (errors are printed).
    """
    try:
        result = chain.invoke(image_base64)
        # Pydantic v2 (required by modern langchain_core) renamed .dict() to
        # .model_dump(); .dict() still works but emits a deprecation warning.
        # Prefer the new API, falling back for Pydantic v1 compatibility.
        if hasattr(result, "model_dump"):
            return result.model_dump()
        return result.dict()
    except Exception as e:
        print("Error during image analysis:", e)
        return None
def process_video_for_footstep_audio(video_path):
    """End-to-end pipeline: first video frame -> base64 -> LLM audio suggestions.

    Parameters:
        video_path: path to the input video file.

    Returns:
        The first suggested audio name (str) on success, or a
        {"error": <message>} dict on any failure. (The original implicitly
        returned None when analysis failed and raised IndexError on an empty
        suggestion list; both paths now return an explicit error dict.)
    """
    print("Extracting first frame from video...")
    first_frame = extract_first_frame(video_path)
    if first_frame is None:
        return {"error": "Failed to extract first frame from video"}

    print("Converting image to base64...")
    image_base64 = image_to_base64(first_frame)
    if image_base64 is None:
        return {"error": "Failed to convert image to base64"}

    print("Analyzing image and generating audio suggestions...")
    result = analyze_image_and_suggest_audio(image_base64)
    if not result:
        # Keep the error contract consistent with the other failure paths.
        return {"error": "Failed to analyze image"}

    # Persist the full structured result alongside the script.
    output_file = "./gemini2.json"
    # dirname of "./gemini2.json" is "."; the `or "."` guard also keeps this
    # safe if the path is ever changed to a bare filename (dirname == "").
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
    with open(output_file, "w") as f:
        json.dump(result, f, indent=2)
    print(f"Results saved to {output_file}")

    suggestions = result.get("audio_suggestions") or []
    print("Audio Suggestions:", suggestions)
    if not suggestions:
        # Guard against an empty list (original raised IndexError here).
        return {"error": "Model returned no audio suggestions"}
    return suggestions[0]