""" DimensioDepth - Add Dimension to Everything Advanced AI Depth Estimation with 3D Visualization Powered by Depth-Anything V2 | Runs on Hugging Face Spaces """ import streamlit as st import numpy as np import cv2 from PIL import Image from pathlib import Path import sys # Page config st.set_page_config( page_title="DimensioDepth - AI Depth Estimation", page_icon="🎨", layout="wide" ) # Add backend to path sys.path.append(str(Path(__file__).parent / "backend")) # Import backend utilities from backend.utils.image_processing import ( depth_to_colormap, create_side_by_side ) # Try to import REAL AI model @st.cache_resource def load_model(): try: print("[*] Attempting to import TransformersDepthEstimator...") from backend.utils.transformers_depth import TransformersDepthEstimator print("[*] Import successful! Loading REAL AI Depth-Anything V2 BASE model...") print("[*] This will download ~372MB on first run (one-time download)") depth_estimator = TransformersDepthEstimator(model_size="base") print("[+] REAL AI MODE ACTIVE - BASE MODEL!") print("[+] Quality: SUPERB (best available)") return depth_estimator, True, "BASE (372MB)" except Exception as e: print(f"[!] FULL ERROR TRACEBACK:") import traceback traceback.print_exc() print(f"[!] Error type: {type(e).__name__}") print(f"[!] Error message: {str(e)}") print("[*] Falling back to DEMO MODE") return None, False, "Demo Mode" depth_estimator, USE_REAL_AI, MODEL_SIZE = load_model() def estimate_depth(image): """Estimate depth from an input image using REAL AI or DEMO MODE""" if image is None: return None, None, "Please upload an image first" try: # Convert PIL to numpy if needed if isinstance(image, Image.Image): image = np.array(image) # Generate depth map if USE_REAL_AI: depth = depth_estimator.predict(image) mode_text = "REAL AI (Depth-Anything V2)" else: from backend.utils.demo_depth import generate_smart_depth depth = generate_smart_depth(image) mode_text = "DEMO MODE (Synthetic)" # Create colored depth map with Inferno colormap (best for depth) depth_colored = depth_to_colormap(depth, cv2.COLORMAP_INFERNO) # Create grayscale depth map depth_gray = (depth * 255).astype(np.uint8) depth_gray = cv2.cvtColor(depth_gray, cv2.COLOR_GRAY2RGB) return depth_colored, depth_gray, mode_text, image.shape, depth.shape except Exception as e: st.error(f"Error during depth estimation: {str(e)}") import traceback traceback.print_exc() return None, None, None, None, None # Header st.title("🎨 DimensioDepth - Add Dimension to Everything") st.markdown("### Transform 2D images into stunning 3D depth visualizations") # Status banner if USE_REAL_AI: st.success(f"🚀 REAL AI MODE ACTIVE! 

def estimate_depth(image):
    """Estimate depth from an input image using real AI or demo mode.

    Returns (depth_colored, depth_gray, mode_text, input_shape, output_shape).
    """
    if image is None:
        # Return a full 5-tuple so the caller's unpacking never fails
        return None, None, "Please upload an image first", None, None

    try:
        # Convert PIL to numpy if needed
        if isinstance(image, Image.Image):
            image = np.array(image)

        # Generate depth map
        if USE_REAL_AI:
            depth = depth_estimator.predict(image)
            mode_text = "REAL AI (Depth-Anything V2)"
        else:
            from backend.utils.demo_depth import generate_smart_depth
            depth = generate_smart_depth(image)
            mode_text = "DEMO MODE (Synthetic)"

        # Colored depth map using the Inferno colormap (well suited to depth)
        depth_colored = depth_to_colormap(depth, cv2.COLORMAP_INFERNO)

        # Grayscale depth map (depth is expected to be normalized to [0, 1])
        depth_gray = (depth * 255).astype(np.uint8)
        depth_gray = cv2.cvtColor(depth_gray, cv2.COLOR_GRAY2RGB)

        return depth_colored, depth_gray, mode_text, image.shape, depth.shape
    except Exception as e:
        st.error(f"Error during depth estimation: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None, None, None, None


# Header
st.title("🎨 DimensioDepth - Add Dimension to Everything")
st.markdown("### Transform 2D images into stunning 3D depth visualizations")

# Status banner
if USE_REAL_AI:
    st.success(f"🚀 REAL AI MODE ACTIVE! Powered by Depth-Anything V2 {MODEL_SIZE} - SUPERB quality!")
else:
    st.info("Running in DEMO MODE - ultra-fast synthetic depth estimation")

st.markdown("---")

# Main interface
col1, col2 = st.columns(2)

with col1:
    st.subheader("Input")
    uploaded_file = st.file_uploader("Upload Your Image", type=['png', 'jpg', 'jpeg'])
    process_btn = st.button("🚀 Generate Depth Map", type="primary")

with col2:
    st.subheader("Output")

# Processing
if uploaded_file is not None and process_btn:
    # Load image; convert to RGB so alpha-channel PNGs do not break processing
    image = Image.open(uploaded_file).convert("RGB")

    with col1:
        st.image(image, caption="Original Image", use_column_width=True)

    with st.spinner("Generating depth map..."):
        depth_colored, depth_gray, mode_text, input_shape, output_shape = estimate_depth(image)

    if depth_colored is not None:
        # Store in session state for video export
        st.session_state['depth_colored'] = depth_colored
        st.session_state['depth_gray'] = depth_gray
        st.session_state['original_image'] = np.array(image)

        with col2:
            tab1, tab2 = st.tabs(["Colored", "Grayscale"])
            with tab1:
                st.image(depth_colored, caption="Depth Map (Colored)", use_column_width=True)
            with tab2:
                st.image(depth_gray, caption="Depth Map (Grayscale)", use_column_width=True)

        # Info
        st.success("✅ Depth Estimation Complete!")
        st.info(f"""
**Mode**: {mode_text}
**Input Size**: {input_shape[1]}x{input_shape[0]}
**Output Size**: {output_shape[1]}x{output_shape[0]}
{f'**Powered by**: Depth-Anything V2 {MODEL_SIZE}' if USE_REAL_AI else '**Processing**: Ultra-fast (<50ms) synthetic depth'}
""")

# Video Export Section
st.markdown("---")
st.subheader("🎬 Video Export")

if 'depth_colored' in st.session_state:
    with st.expander("Export Depth Map as Video", expanded=True):
        col_vid1, col_vid2 = st.columns(2)

        with col_vid1:
            video_duration = st.slider("Duration (seconds)", 1, 30, 10,
                                       help="Length of each animation loop")
            video_fps = st.selectbox("FPS", [24, 30, 60], index=1)
            video_resolution = st.selectbox("Resolution", [
                "Original",
                "4K UHD (3840x2160)",
                "1080p (1920x1080)",
                "720p (1280x720)",
                "Square 1080p (1080x1080)",
                "Portrait 1080p (1080x1920)",
                "Portrait 720p (720x1280)"
            ], index=2)

        with col_vid2:
            video_effect = st.selectbox("Camera Effect", [
                "Zoom In", "Zoom Out",
                "Pan Left", "Pan Right", "Pan Up", "Pan Down",
                "Rotate CW", "Rotate CCW",
                "Ken Burns (Zoom + Pan)",
                "Dolly In", "Dolly Out",
                "Tilt Up", "Tilt Down",
                "Orbit"
            ])
            effect_intensity = st.slider(
                "Effect Intensity", 0.1, 3.0, 1.0, 0.1,
                help="Control how strong the camera movement is (0.5 = subtle, 2.0 = dramatic)"
            )

        # Additional controls row
        col_vid3, col_vid4 = st.columns(2)
        with col_vid3:
            loop_count = st.slider("Number of Loops", 1, 10, 1,
                                   help="How many times to repeat the animation")
        with col_vid4:
            video_quality = st.selectbox("Video Quality", [
                "High (8 Mbps)", "Medium (5 Mbps)", "Low (3 Mbps)"
            ], index=0)
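        # How the export works: the total frame count is duration x fps x loops
        # (e.g. 10 s x 30 fps x 2 loops = 600 frames). Each frame is derived
        # from the resized original photo by a crop, roll, affine, or
        # perspective transform driven by the loop progress, then encoded with
        # OpenCV's VideoWriter using the 'mp4v' codec.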
        if st.button("🎬 Export Video", type="primary"):
            with st.spinner("Generating video..."):
                try:
                    import tempfile

                    # Export the real photo with camera effects applied,
                    # not the colored depth visualization
                    original_image = st.session_state['original_image']

                    # Parse resolution
                    if "4K" in video_resolution:
                        width, height = 3840, 2160
                    elif "1080p" in video_resolution:
                        if "Portrait" in video_resolution:
                            width, height = 1080, 1920
                        elif "Square" in video_resolution:
                            width, height = 1080, 1080
                        else:
                            width, height = 1920, 1080
                    elif "720p" in video_resolution:
                        if "Portrait" in video_resolution:
                            width, height = 720, 1280
                        else:
                            width, height = 1280, 720
                    else:  # Original
                        height, width = original_image.shape[:2]

                    # Parse video quality. Note: OpenCV's VideoWriter does not
                    # expose a bitrate setting, so this value is currently
                    # informational only.
                    if "High" in video_quality:
                        bitrate = 8_000_000
                    elif "Medium" in video_quality:
                        bitrate = 5_000_000
                    else:  # Low
                        bitrate = 3_000_000

                    # Resize the original image (not the depth map) to the target size
                    image_resized = cv2.resize(original_image, (width, height))

                    # Total frames across all loops
                    frames_per_loop = video_duration * video_fps
                    total_frames = frames_per_loop * loop_count

                    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file:
                        output_path = tmp_file.name

                    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                    out = cv2.VideoWriter(output_path, fourcc, video_fps, (width, height))

                    for frame_num in range(total_frames):
                        # Progress within the current loop (0 to 1)
                        progress = (frame_num % frames_per_loop) / frames_per_loop

                        # Apply the selected effect to the real photo; the
                        # intensity multiplier controls how dramatic the movement is
                        if video_effect == "Zoom In":
                            scale = 1.0 + (progress * 0.5 * effect_intensity)
                            center_x, center_y = width // 2, height // 2
                            new_w, new_h = int(width / scale), int(height / scale)
                            x1, y1 = center_x - new_w // 2, center_y - new_h // 2
                            x2, y2 = x1 + new_w, y1 + new_h
                            cropped = image_resized[max(0, y1):min(height, y2), max(0, x1):min(width, x2)]
                            frame = cv2.resize(cropped, (width, height))
                        elif video_effect == "Zoom Out":
                            # Clamp at 1.0 so high intensities cannot shrink the
                            # scale toward zero and degenerate the crop window
                            scale = max(1.5 - (progress * 0.5 * effect_intensity), 1.0)
                            center_x, center_y = width // 2, height // 2
                            new_w, new_h = int(width / scale), int(height / scale)
                            x1, y1 = center_x - new_w // 2, center_y - new_h // 2
                            x2, y2 = x1 + new_w, y1 + new_h
                            cropped = image_resized[max(0, y1):min(height, y2), max(0, x1):min(width, x2)]
                            frame = cv2.resize(cropped, (width, height))
                        elif video_effect == "Ken Burns (Zoom + Pan)":
                            # Ken Burns: zoom in while panning
                            scale = 1.0 + (progress * 0.4 * effect_intensity)
                            pan_x = int(width * progress * 0.2 * effect_intensity)
                            pan_y = int(height * progress * 0.1 * effect_intensity)
                            center_x = width // 2 + pan_x
                            center_y = height // 2 + pan_y
                            new_w, new_h = int(width / scale), int(height / scale)
                            x1, y1 = center_x - new_w // 2, center_y - new_h // 2
                            x2, y2 = x1 + new_w, y1 + new_h
                            cropped = image_resized[max(0, y1):min(height, y2), max(0, x1):min(width, x2)]
                            frame = cv2.resize(cropped, (width, height))
                        elif video_effect == "Dolly In":
                            # Dolly in: gentler zoom than "Zoom In"
                            scale = 1.0 + (progress * 0.3 * effect_intensity)
                            center_x, center_y = width // 2, height // 2
                            new_w, new_h = int(width / scale), int(height / scale)
                            x1, y1 = center_x - new_w // 2, center_y - new_h // 2
                            x2, y2 = x1 + new_w, y1 + new_h
                            cropped = image_resized[max(0, y1):min(height, y2), max(0, x1):min(width, x2)]
                            frame = cv2.resize(cropped, (width, height))
                        elif video_effect == "Dolly Out":
                            # Clamped for the same reason as "Zoom Out"
                            scale = max(1.3 - (progress * 0.3 * effect_intensity), 1.0)
                            center_x, center_y = width // 2, height // 2
                            new_w, new_h = int(width / scale), int(height / scale)
                            x1, y1 = center_x - new_w // 2, center_y - new_h // 2
                            x2, y2 = x1 + new_w, y1 + new_h
                            cropped = image_resized[max(0, y1):min(height, y2), max(0, x1):min(width, x2)]
                            frame = cv2.resize(cropped, (width, height))
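                        # The zoom/dolly/Ken Burns branches above share one
                        # pattern: take a crop window of size (width/scale,
                        # height/scale) around a center point, clamp it to the
                        # frame, and resize it back to full resolution. The pan
                        # branches below use np.roll instead, which wraps pixels
                        # around to the opposite edge rather than revealing new
                        # content.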
                        elif video_effect == "Pan Left":
                            offset = int(width * progress * 0.3 * effect_intensity)
                            frame = np.roll(image_resized, -offset, axis=1)
                        elif video_effect == "Pan Right":
                            offset = int(width * progress * 0.3 * effect_intensity)
                            frame = np.roll(image_resized, offset, axis=1)
                        elif video_effect == "Pan Up":
                            offset = int(height * progress * 0.3 * effect_intensity)
                            frame = np.roll(image_resized, -offset, axis=0)
                        elif video_effect == "Pan Down":
                            offset = int(height * progress * 0.3 * effect_intensity)
                            frame = np.roll(image_resized, offset, axis=0)
                        elif video_effect == "Tilt Up":
                            # Tilt up: perspective transform that pulls the top edge down
                            tilt_factor = progress * 0.3 * effect_intensity
                            pts1 = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
                            pts2 = np.float32([
                                [0, int(height * tilt_factor)],
                                [width, int(height * tilt_factor)],
                                [0, height],
                                [width, height]
                            ])
                            matrix = cv2.getPerspectiveTransform(pts1, pts2)
                            frame = cv2.warpPerspective(image_resized, matrix, (width, height))
                        elif video_effect == "Tilt Down":
                            tilt_factor = progress * 0.3 * effect_intensity
                            pts1 = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
                            pts2 = np.float32([
                                [0, 0],
                                [width, 0],
                                [0, height - int(height * tilt_factor)],
                                [width, height - int(height * tilt_factor)]
                            ])
                            matrix = cv2.getPerspectiveTransform(pts1, pts2)
                            frame = cv2.warpPerspective(image_resized, matrix, (width, height))
                        elif video_effect == "Rotate CW":
                            # OpenCV treats positive angles as counter-clockwise,
                            # so negate for a clockwise rotation
                            angle = progress * 360 * effect_intensity
                            center = (width // 2, height // 2)
                            rotation_matrix = cv2.getRotationMatrix2D(center, -angle, 1.0)
                            frame = cv2.warpAffine(image_resized, rotation_matrix, (width, height))
                        elif video_effect == "Rotate CCW":
                            angle = progress * 360 * effect_intensity
                            center = (width // 2, height // 2)
                            rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
                            frame = cv2.warpAffine(image_resized, rotation_matrix, (width, height))
                        elif video_effect == "Orbit":
                            # Orbit: full rotation plus a sinusoidal zoom pulse
                            angle = progress * 360 * effect_intensity
                            scale = 1.0 + (np.sin(progress * np.pi) * 0.2 * effect_intensity)
                            center = (width // 2, height // 2)
                            rotation_matrix = cv2.getRotationMatrix2D(center, angle, scale)
                            frame = cv2.warpAffine(image_resized, rotation_matrix, (width, height))
                        else:
                            frame = image_resized.copy()

                        # VideoWriter expects BGR frames
                        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                        out.write(frame_bgr)

                    out.release()

                    # Read the video back and offer it for download
                    with open(output_path, 'rb') as f:
                        video_bytes = f.read()

                    total_duration = video_duration * loop_count
                    st.success(
                        f"✅ Video generated! {total_frames} frames at {video_fps} FPS "
                        f"({total_duration}s total, {loop_count} loop{'s' if loop_count > 1 else ''})"
                    )
                    st.info(f"📊 Settings: {video_resolution} | {video_quality} | "
                            f"Effect Intensity: {effect_intensity}x")

                    st.download_button(
                        label="📥 Download Video",
                        data=video_bytes,
                        file_name=(
                            f"dimensio_"
                            f"{video_effect.lower().replace(' ', '_').replace('(', '').replace(')', '')}"
                            f"_{width}x{height}_{video_fps}fps.mp4"
                        ),
                        mime="video/mp4"
                    )
                except Exception as e:
                    st.error(f"Error generating video: {str(e)}")
                    import traceback
                    traceback.print_exc()
else:
    st.info("👆 Upload an image and generate depth map first to enable video export")

# Info section
st.markdown("---")
st.markdown("""
## 💡 About DimensioDepth

### Features:
- ✅ Real AI depth estimation with the Depth-Anything V2 BASE model
- ✅ Fast processing (~800ms on CPU, ~200ms on GPU)
- ✅ High-quality depth maps from a single image
- ✅ **Professional video export** with cinematic camera movements
- ✅ **Advanced controls** - effect intensity, loops, and quality settings

### Video Export Controls:
- ⏱️ **Duration** - 1 to 30 seconds per loop
- 🔁 **Loops** - Repeat the animation 1 to 10 times
- 🎚️ **Effect Intensity** - Control movement strength (0.1x to 3.0x)
    - 0.5x = Subtle, professional movements
    - 1.0x = Default, balanced effects
    - 2.0x = Dramatic, bold camera work
- 📐 **Resolutions** - Original, 4K UHD, 1080p, 720p, square, and portrait modes
- 🎬 **Quality** - High (8 Mbps), Medium (5 Mbps), Low (3 Mbps)
- 🎞️ **Frame Rates** - 24fps (cinematic), 30fps (standard), 60fps (smooth)

### Camera Effects:
- 📹 **Zoom In/Out** - Smooth zoom controls
- 🎬 **Pan** - Left, right, up, and down panning
- 🎥 **Dolly** - Professional dolly in/out shots
- 🎞️ **Tilt** - Up/down tilt movements
- 🔄 **Rotate** - Clockwise/counter-clockwise rotation
- ⭐ **Ken Burns** - Classic zoom + pan effect
- 🌀 **Orbit** - Smooth orbital rotation

### Use Cases:
- 🎨 **Creative & Artistic**: Depth-enhanced photos, 3D effects
- 🎬 **VFX & Film**: Depth map generation for compositing
- 🔬 **Research**: Computer vision, depth perception studies
- 📱 **Content Creation**: Engaging 3D effects for social media

Made with ❤️ for the AI community
""")
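
# To run locally (assumptions: this file is saved as app.py, and the backend/
# package with transformers_depth.py and demo_depth.py sits alongside it):
#
#     pip install streamlit numpy opencv-python-headless pillow
#     pip install torch transformers   # presumably required for real AI mode
#     streamlit run app.py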