Spaces:

abhi02072005
/

ai-foley-studio-backend

Sleeping

File size: 4,477 Bytes

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from custom_wrapper import OpenRouterChat
from pydantic import BaseModel, Field
from typing import List
import os
import json
import cv2
import base64
from PIL import Image
import io

# Load variables from a .env file (if one is found) into the process
# environment; this must run before the key is read below.
load_dotenv()
# API key used to build the OpenRouter client; None when the variable is unset.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")


# Structured reply expected from the model. It is handed to PydanticOutputParser
# as the target type, so the field descriptions below act as prompt text.
# NOTE(review): documented with a comment rather than a docstring on purpose;
# a model docstring may be emitted into the schema sent to the LLM -- confirm
# before adding one.
class AudioSuggestionOutput(BaseModel):
    # Candidate footstep audio names; defaults to an empty list when omitted.
    audio_suggestions: List[str] = Field(
        default_factory=list,
        description="Suggested audio names for footsteps",
    )
    # Free-text description of the scene and the walking surface.
    environment_description: str = Field(
        description="Description of the environment and ground surface",
    )
    # The model's justification for the names it proposed.
    reasoning: str = Field(
        description="Explanation for the audio suggestions",
    )


# Module-level chat model client shared by the analysis chain. OpenRouterChat
# comes from the project's custom_wrapper module; how it interprets these
# arguments is not visible from this file.
llm = OpenRouterChat(
    api_key=OPENROUTER_API_KEY,
    model="meta-llama/llama-3.2-90b-vision-instruct",
    temperature=0.7,
    max_tokens=1024
)

# Parser targeting AudioSuggestionOutput; it also supplies the format
# instructions that are injected into the prompt.
parser = PydanticOutputParser(pydantic_object=AudioSuggestionOutput)


def extract_first_frame(video_path):
    """Return the first frame of a video, or None if it cannot be read.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        The first frame as returned by ``cap.read()``, or ``None`` when the
        file cannot be opened, no frame can be read, or any other error
        occurs. Errors are printed, not raised.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Cannot open video file: {video_path}")

        success, frame = cap.read()
        if not success:
            raise ValueError("Cannot read the first frame from video")

        return frame
    except Exception as e:
        print(f"Error extracting first frame: {e}")
        return None
    finally:
        # Release on every path (open failure, read error, success) so the
        # underlying capture handle is never leaked.
        if cap is not None:
            cap.release()


def image_to_base64(image):
    """Encode an OpenCV image as a base64 JPEG string.

    The image is converted from BGR to RGB, written as a JPEG (quality 85)
    into an in-memory buffer, and the buffer is base64-encoded.

    Returns:
        The base64 text, or ``None`` if any step fails (the error is printed).
    """
    try:
        # OpenCV uses BGR channel order; Pillow expects RGB.
        rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        jpeg_buffer = io.BytesIO()
        Image.fromarray(rgb_pixels).save(jpeg_buffer, format="JPEG", quality=85)

        encoded_bytes = base64.b64encode(jpeg_buffer.getvalue())
        return encoded_bytes.decode()
    except Exception as exc:
        print(f"Error converting image to base64: {exc}")
        return None


# Prompt template with two placeholders: {image_data} (the value passed to the
# chain) and {format_instructions} (generated by the output parser).
prompt = ChatPromptTemplate.from_template("""
You are an expert sound designer and environmental analyst. 
Analyze the provided image and suggest appropriate audio names for footsteps based on the environment, ground surface, and surroundings.

Image Data: {image_data}

Please analyze:
1. The type of ground/surface (concrete, grass, wood, carpet, gravel, etc.)
2. The environment (indoor, outdoor, urban, natural, etc.)
3. Weather conditions if visible (wet, dry, snowy, etc.)
4. Any other relevant factors that would affect footstep sounds
5. Audio suggestion's name must be friendly for a youtube search
6. Name without extensions

Provide 3-5 specific, descriptive audio file name suggestions for footsteps in this environment.
The names should be clear, concise, and follow standard audio naming conventions.

{format_instructions}
""")

# Pipeline: the chain input is passed through unchanged as image_data, the
# parser's format instructions are added, the prompt is rendered, sent to the
# LLM, and the reply is parsed into an AudioSuggestionOutput.
# NOTE(review): the base64 string is interpolated into the prompt as plain
# text. Whether OpenRouterChat turns that into an actual image attachment is
# not visible here -- confirm the vision model really receives the image, and
# that a full-frame base64 payload fits within the model's context limit.
chain = (
        {"image_data": RunnablePassthrough(), "format_instructions": lambda x: parser.get_format_instructions()}
        | prompt
        | llm
        | parser
)


def analyze_image_and_suggest_audio(image_base64):
    """Run the analysis chain on a base64 image and return the result as a dict.

    Args:
        image_base64: Base64-encoded image text passed as the chain input.

    Returns:
        A dict built from the parsed model output, or ``None`` if the chain
        call or the conversion fails (the error is printed, not raised).
    """
    try:
        result = chain.invoke(image_base64)
        # Pydantic v2 deprecates .dict() in favour of .model_dump(); prefer the
        # new name when present and fall back so Pydantic v1 keeps working.
        to_dict = getattr(result, "model_dump", None) or result.dict
        return to_dict()
    except Exception as e:
        print("Error during image analysis:", e)
        return None


def process_video_for_footstep_audio(video_path, output_file="./gemini2.json"):
    """Suggest a footstep audio name for a video based on its first frame.

    Extracts the first frame, encodes it as base64, runs the image analysis,
    saves the full analysis as JSON, and returns the top suggestion.

    Args:
        video_path: Path of the video to analyse.
        output_file: Where the full analysis result is written as JSON.
            Defaults to the original hard-coded location.

    Returns:
        The first suggested audio name on success, or a dict of the form
        ``{"error": "<message>"}`` when any stage fails or the analysis
        yields no suggestions.

    Raises:
        OSError: If the output directory or file cannot be written.
    """
    # Extract first frame from video
    print("Extracting first frame from video...")
    first_frame = extract_first_frame(video_path)
    if first_frame is None:
        return {"error": "Failed to extract first frame from video"}

    # Convert image to base64
    print("Converting image to base64...")
    image_base64 = image_to_base64(first_frame)
    if image_base64 is None:
        return {"error": "Failed to convert image to base64"}

    # Analyze image and get audio suggestions
    print("Analyzing image and generating audio suggestions...")
    result = analyze_image_and_suggest_audio(image_base64)
    if not result:
        # The analysis returns None on failure; report it the same way as the
        # earlier stages instead of crashing on result[...] below.
        return {"error": "Failed to analyze image and generate audio suggestions"}

    # Save results. A bare filename has an empty dirname, which os.makedirs
    # rejects, so only create the directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    print(f"Results saved to {output_file}")

    suggestions = result.get("audio_suggestions") or []
    print("Audio Suggestions:", suggestions)
    if not suggestions:
        # The schema allows an empty list, so indexing [0] is not safe.
        return {"error": "No audio suggestions were generated"}
    return suggestions[0]