File size: 4,477 Bytes
80aa632
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe30934
80aa632
 
 
 
 
 
 
86793d6
80aa632
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from custom_wrapper import OpenRouterChat
from pydantic import BaseModel, Field
from typing import List
import os
import json
import cv2
import base64
from PIL import Image
import io

# Load variables from a .env file (if one is found) before the key lookup below.
load_dotenv()

# Stays None when the environment variable is not set.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")


# Structured reply expected from the model. This class is passed to
# PydanticOutputParser further down, so its field names and descriptions feed
# the format instructions placed in the prompt.
# NOTE(review): documented with `#` comments on purpose -- a class docstring
# would presumably surface in the generated schema and change the prompt text;
# confirm before adding one.
class AudioSuggestionOutput(BaseModel):
    # Suggested audio names; an empty list is used when the field is omitted.
    audio_suggestions: List[str] = Field(default_factory=list, description="Suggested audio names for footsteps")
    # Required free-text description of the scene and ground surface.
    environment_description: str = Field(description="Description of the environment and ground surface")
    # Required free-text justification for the suggestions.
    reasoning: str = Field(description="Explanation for the audio suggestions")


# Chat model client built from the project-local OpenRouterChat wrapper.
# NOTE(review): OPENROUTER_API_KEY is None when the environment variable is
# missing and nothing here checks for that -- confirm how the wrapper reacts.
llm = OpenRouterChat(
    api_key=OPENROUTER_API_KEY,
    model="meta-llama/llama-3.2-90b-vision-instruct",
    temperature=0.7,
    max_tokens=1024
)

# Turns the model's reply into an AudioSuggestionOutput instance.
parser = PydanticOutputParser(pydantic_object=AudioSuggestionOutput)


def extract_first_frame(video_path):
    """Extract the first frame from a video file.

    Args:
        video_path: Path (or any source accepted by ``cv2.VideoCapture``) of
            the video to read.

    Returns:
        The first frame as returned by ``cv2.VideoCapture.read()``, or ``None``
        if the video cannot be opened or read. Failures are reported on stdout
        rather than raised, so callers only need to check for ``None``.
    """
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                raise ValueError(f"Cannot open video file: {video_path}")

            success, frame = cap.read()
        finally:
            # Release the capture handle on every path, including the
            # "cannot open" error and any exception raised by read().
            cap.release()

        if not success:
            raise ValueError("Cannot read the first frame from video")

        return frame
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with
        # None instead of propagating.
        print(f"Error extracting first frame: {e}")
        return None


def image_to_base64(image):
    """Encode an OpenCV image as a base64 JPEG string.

    Args:
        image: Image array in OpenCV's BGR channel order.

    Returns:
        The JPEG bytes (quality 85) encoded as a base64 text string, or
        ``None`` if any step of the conversion fails. Failures are printed,
        not raised.
    """
    try:
        # Swap BGR -> RGB so PIL interprets the channels correctly.
        rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Encode the JPEG into an in-memory buffer rather than a file.
        jpeg_buffer = io.BytesIO()
        Image.fromarray(rgb_pixels).save(jpeg_buffer, format="JPEG", quality=85)

        return base64.b64encode(jpeg_buffer.getvalue()).decode()
    except Exception as err:
        print(f"Error converting image to base64: {err}")
        return None


# Prompt for the footstep-audio analysis. `image_data` receives the value
# passed to chain.invoke(); `format_instructions` is filled from the parser.
# NOTE(review): the base64 payload is interpolated into the prompt as plain
# text ("Image Data: ..."), not attached as an image message part. Confirm that
# OpenRouterChat converts it into real image input; otherwise the vision model
# only sees a very long base64 string.
prompt = ChatPromptTemplate.from_template("""
You are an expert sound designer and environmental analyst. 
Analyze the provided image and suggest appropriate audio names for footsteps based on the environment, ground surface, and surroundings.

Image Data: {image_data}

Please analyze:
1. The type of ground/surface (concrete, grass, wood, carpet, gravel, etc.)
2. The environment (indoor, outdoor, urban, natural, etc.)
3. Weather conditions if visible (wet, dry, snowy, etc.)
4. Any other relevant factors that would affect footstep sounds
5. Audio suggestion's name must be friendly for a youtube search
6. Name without extensions

Provide 3-5 specific, descriptive audio file name suggestions for footsteps in this environment.
The names should be clear, concise, and follow standard audio naming conventions.

{format_instructions}
""")

# Pipeline: map the invoke() input to the prompt variables -> render the
# prompt -> call the model -> parse the reply with `parser`.
chain = (
        {"image_data": RunnablePassthrough(), "format_instructions": lambda x: parser.get_format_instructions()}
        | prompt
        | llm
        | parser
)


def analyze_image_and_suggest_audio(image_base64):
    """Analyze the image and suggest audio names for footsteps.

    Args:
        image_base64: Base64-encoded image string passed to the chain as its
            single input.

    Returns:
        The parsed model output as a plain dict, or ``None`` if invoking the
        chain or parsing its reply fails. Failures are printed, not raised.
    """
    try:
        result = chain.invoke(image_base64)
        # pydantic v2 deprecates .dict() in favour of .model_dump(); fall back
        # to .dict() so a pydantic v1 model keeps working.
        to_dict = getattr(result, "model_dump", None) or result.dict
        return to_dict()
    except Exception as e:
        print("Error during image analysis:", e)
        return None


def process_video_for_footstep_audio(video_path, output_file="./gemini2.json"):
    """Suggest a footstep audio name for a video based on its first frame.

    Extracts the first frame, encodes it as base64, asks the model for audio
    suggestions, writes the full analysis to ``output_file`` as JSON, and
    returns the first suggestion.

    Args:
        video_path: Path of the video to analyze.
        output_file: Where the JSON analysis is written. Defaults to the
            previously hard-coded ``./gemini2.json``.

    Returns:
        The first suggested audio name (``str``) on success, or a dict of the
        form ``{"error": <message>}`` when any stage fails.
    """
    # Extract first frame from video
    print("Extracting first frame from video...")
    first_frame = extract_first_frame(video_path)

    if first_frame is None:
        return {"error": "Failed to extract first frame from video"}

    # Convert image to base64
    print("Converting image to base64...")
    image_base64 = image_to_base64(first_frame)

    if image_base64 is None:
        return {"error": "Failed to convert image to base64"}

    # Analyze image and get audio suggestions
    print("Analyzing image and generating audio suggestions...")
    result = analyze_image_and_suggest_audio(image_base64)

    # The analysis helper returns None on failure; report it the same way as
    # the earlier stages instead of crashing on result[...] below.
    if not result:
        return {"error": "Failed to analyze image and generate audio suggestions"}

    # Save results. os.makedirs("") raises, so only create a directory when
    # the output path actually has a directory component.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    print(f"Results saved to {output_file}")

    suggestions = result.get("audio_suggestions") or []
    print("Audio Suggestions:", suggestions)

    # The model may legitimately return an empty list (the schema default).
    if not suggestions:
        return {"error": "No audio suggestions were generated"}

    return suggestions[0]