File size: 14,327 Bytes
568817c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import numpy as np
import tempfile
import os
import spaces

# Global variables for models.
# All three start as None and are lazily populated by load_models() on the
# first generation request, so the Space can boot before the large
# checkpoints are downloaded.
model = None       # Maya1 causal LM (transformers AutoModelForCausalLM)
tokenizer = None   # matching AutoTokenizer
snac_model = None  # SNAC neural audio codec used to decode tokens to waveform

def load_models():
    """Lazily load the Maya1 LLM/tokenizer and the SNAC vocoder into globals.

    Safe to call repeatedly: each model is fetched from the Hub only while
    its module-level slot is still ``None``; later calls are no-ops.
    """
    global model, tokenizer, snac_model

    repo_id = "maya-research/maya1"

    if model is None:
        print("Loading Maya1 model...")
        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        # Some checkpoints ship without a dedicated pad token; fall back to EOS
        # so generate() can pad batches without erroring.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print(f"Tokenizer vocab size: {len(tokenizer)}")
        print(f"Model vocab size: {model.config.vocab_size}")
        print(f"EOS token ID: {tokenizer.eos_token_id}")
        print(f"PAD token ID: {tokenizer.pad_token_id}")

    if snac_model is None:
        print("Loading SNAC audio decoder...")
        snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()

@spaces.GPU(duration=180)
def generate_speech(text: str, voice_description: str, temperature: float, top_p: float, max_tokens: float) -> str:
    """Generate speech from ``text`` with the Maya1 model under ZeroGPU.

    Args:
        text: Text to speak; may contain emotion tags such as ``<laugh>``.
        voice_description: Natural-language voice design prompt. A neutral
            default is substituted when blank.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        max_tokens: Upper bound on newly generated tokens (cast to int).

    Returns:
        Path to a temporary 24 kHz WAV file with the rendered speech.

    Raises:
        gr.Error: On empty input text, too few SNAC audio tokens, or any
            unexpected failure during generation (original exception is
            printed to the log before re-raising).
    """
    
    if not text.strip():
        raise gr.Error("Please enter some text to convert to speech!")
    
    # Blank description -> fall back to a neutral, generic voice prompt.
    if not voice_description.strip():
        voice_description = "Realistic voice with neutral tone and conversational pacing."
    
    try:
        # Load models if not already loaded
        load_models()
        
        # Move models to GPU (inside the @spaces.GPU window, where CUDA exists)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        snac_model.to(device)
        
        # Create prompt - exactly as shown in docs
        prompt = f'<description="{voice_description}"> {text}'
        
        print(f"Prompt: {prompt}")
        
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        
        print(f"Input IDs shape: {inputs['input_ids'].shape}")
        print(f"Input length: {inputs['input_ids'].shape[1]}")
        print(f"First 20 input tokens: {inputs['input_ids'][0][:20].tolist()}")
        
        # Generate with settings from the documentation
        # Don't pass eos_token_id to allow generation to continue
        with torch.inference_mode():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs.get('attention_mask', None),
                max_new_tokens=int(max_tokens), 
                temperature=float(temperature), 
                top_p=float(top_p), 
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                # Don't set eos_token_id - let model decide when to stop
                # or set it to a value outside SNAC range
                eos_token_id=None,
                repetition_penalty=1.1,
            )
        
        print(f"Output shape: {outputs.shape}")
        print(f"Total output length: {outputs.shape[1]}")
        
        # Extract SNAC audio tokens: drop the prompt prefix, keep only the
        # newly generated continuation.
        generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
        
        print(f"Generated {len(generated_ids)} new tokens")
        print(f"First 50 generated IDs: {generated_ids[:50].tolist()}")
        
        # Keep only ids in the SNAC audio range. The range spans
        # 156937 - 128266 + 1 = 28672 = 7 * 4096 ids, matching the
        # 7-tokens-per-frame / 4096-entry-codebook layout decoded below.
        snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
        
        print(f"Total SNAC tokens: {len(snac_tokens)}")
        
        # Fewer than 7 SNAC tokens means not even one full audio frame:
        # dump diagnostics and surface an actionable error to the UI.
        if len(snac_tokens) < 7:
            # Show all generated token IDs for debugging
            all_tokens = generated_ids.tolist()
            unique_tokens = sorted(list(set(all_tokens)))
            print(f"All unique token IDs ({len(unique_tokens)}): {unique_tokens[:100]}")
            
            # Check if any tokens are in expected range
            in_range = [t for t in all_tokens if 128266 <= t <= 156937]
            print(f"Tokens in SNAC range: {len(in_range)}")
            
            raise gr.Error(
                f"Model generated only {len(generated_ids)} tokens, with {len(snac_tokens)} SNAC audio tokens. "
                f"Token range: {min(all_tokens) if all_tokens else 'N/A'}-{max(all_tokens) if all_tokens else 'N/A'}. "
                f"Expected SNAC range: 128266-156937. This may indicate a model configuration issue. "
                f"Try: 1) Longer input text, 2) Increase max_tokens to 1500, 3) Different temperature (0.6-0.8)"
            )
        
        # Decode SNAC tokens to audio frames (integer division drops any
        # trailing partial frame).
        frames = len(snac_tokens) // 7
        
        print(f"Audio frames: {frames}")
        
        # Three lists, one per SNAC codebook level (coarse -> fine).
        codes = [[], [], []]
        
        # Each 7-token frame interleaves the levels as laid out below:
        # position 0 -> level 0; positions 1 and 4 -> level 1;
        # positions 2, 3, 5, 6 -> level 2. Subtracting 128266 rebases the
        # id to the audio range; % 4096 folds it into its codebook slot.
        for i in range(frames):
            s = snac_tokens[i*7:(i+1)*7]
            codes[0].append((s[0]-128266) % 4096)
            codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
            codes[2].extend([
                (s[2]-128266) % 4096, 
                (s[3]-128266) % 4096, 
                (s[5]-128266) % 4096, 
                (s[6]-128266) % 4096
            ])
        
        # Generate final audio with SNAC decoder (each level gets a
        # [1, T] long tensor on the same device as the decoder).
        codes_tensor = [
            torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0) 
            for c in codes
        ]
        
        with torch.inference_mode():
            audio = snac_model.decoder(
                snac_model.quantizer.from_codes(codes_tensor)
            )[0, 0].cpu().numpy()
        
        print(f"Audio shape: {audio.shape}, Duration: {len(audio)/24000:.2f}s")
        
        # Save to temporary file. delete=False is intentional: Gradio's
        # Audio(type="filepath") output reads the file after we return.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            sf.write(f.name, audio, 24000)
            return f.name
            
    except Exception as e:
        # Log the full traceback server-side, then re-raise as a gr.Error so
        # the UI shows a readable message instead of a blank failure.
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Error generating speech: {str(e)}")

# Canned voice-design prompts selectable from the UI dropdown.
# "Custom" maps to an empty string so the user can type their own.
voice_presets = {
    "Male - American": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
    "Female - British": "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
    "Male - Deep": "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
    "Female - Energetic": "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
    "Neutral - Professional": "Professional neutral voice with clear articulation. Balanced pitch, crisp tone, measured pacing.",
    "Male - Warm": "Warm male voice with friendly tone. Medium pitch, smooth timbre, relaxed pacing.",
    "Custom": ""
}

def update_voice_description(preset):
    """Return the stored description for *preset*, or '' for unknown names."""
    if preset in voice_presets:
        return voice_presets[preset]
    return ""

# Example rows for gr.Examples. Column order must match the inputs wired
# below: [text, voice_description, temperature, top_p, max_tokens].
# max_tokens is set to 1000 in every row to leave enough budget for audio.
examples = [
    [
        "Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
        "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
        0.7,
        0.9,
        1000
    ],
    [
        "I'm so excited to share this amazing news with you! This is incredible and wonderful!",
        "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
        0.8,
        0.9,
        1000
    ],
    [
        "In a world of constant change, one thing remains certain: the power of human connection and understanding.",
        "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
        0.6,
        0.85,
        1000
    ],
    [
        "The gentle breeze whispered through the trees as the sun set over the horizon in beautiful colors.",
        "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
        0.7,
        0.9,
        1000
    ]
]

# Create Gradio interface. `demo` is a Blocks app: left column holds the
# inputs (text, voice design, advanced sampling settings), right column the
# audio output plus usage tips; event wiring follows the layout.
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
    # Static page header.
    gr.HTML("""
        <div style="text-align: center; padding: 20px;">
            <h1>πŸŽ™οΈ Maya1 Text-to-Speech</h1>
            <p style="font-size: 18px; color: #666;">
                Generate emotional and realistic speech with natural language voice design
            </p>
            <p style="font-size: 14px; margin-top: 10px;">
                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">anycoder</a>
            </p>
            <p style="font-size: 12px; color: #28a745; margin-top: 5px;">
                ⚑ Powered by ZeroGPU for efficient inference
            </p>
        </div>
    """)
    
    with gr.Row():
        # Left column: all generation inputs.
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“ Input")
            
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter your text here... You can use <laugh>, <sigh>, and other emotion tags!",
                lines=5,
                max_lines=10,
                value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions."
            )
            
            gr.Markdown("### 🎨 Voice Design")
            
            # Preset dropdown; selecting one overwrites the description box
            # via the .change handler wired below.
            voice_preset = gr.Dropdown(
                choices=list(voice_presets.keys()),
                label="Voice Preset",
                value="Male - American",
                info="Select a preset or choose 'Custom' to write your own"
            )
            
            voice_description = gr.Textbox(
                label="Voice Description",
                placeholder="Describe the voice characteristics...",
                lines=3,
                value=voice_presets["Male - American"],
                info="Describe age, gender, accent, pitch, timbre, and pacing"
            )
            
            # Sampling knobs forwarded directly to generate_speech.
            with gr.Accordion("βš™οΈ Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness (higher may help with generation)"
                )
                
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Nucleus sampling threshold"
                )
                
                max_tokens = gr.Slider(
                    minimum=500,
                    maximum=2000,
                    value=1000,
                    step=100,
                    label="Max Tokens",
                    info="Maximum length of generated audio (higher = longer audio)"
                )
            
            generate_btn = gr.Button("🎀 Generate Speech", variant="primary", size="lg")
        
        # Right column: generated audio and help text.
        with gr.Column(scale=1):
            gr.Markdown("### πŸ”Š Output")
            
            # type="filepath" matches generate_speech returning a WAV path.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False
            )
            
            gr.Markdown("""
            ### πŸ’‘ Tips
            - **Use longer sentences** (20+ words recommended)
            - Start with **temperature=0.7** and **max_tokens=1000**
            - Use emotion tags like `<laugh>`, `<sigh>`, `<whisper>` in your text
            - Experiment with different voice descriptions
            - GPU allocation: 180 seconds per generation
            
            ### 🎭 Emotion Tags
            You can use various emotion tags in your text:
            - `<laugh>` - Laughter
            - `<sigh>` - Sighing
            - `<whisper>` - Whispering
            - `<shout>` - Shouting
            
            ### βš™οΈ Troubleshooting
            If generation fails with "not enough tokens":
            1. **Increase temperature** to 0.7-0.8
            2. **Use longer input text** (full sentences)
            3. **Increase max_tokens** to 1500-2000
            4. Try different voice descriptions
            
            ### πŸ” Known Issue
            This model may have specific requirements for the Hugging Face Spaces environment.
            If issues persist, the model may need additional configuration or dependencies.
            """)
    
    # Update voice description when preset changes
    voice_preset.change(
        fn=update_voice_description,
        inputs=[voice_preset],
        outputs=[voice_description]
    )
    
    # Generate speech button -> runs the ZeroGPU-decorated generator.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output]
    )
    
    # Examples section (cache_examples=False: each example renders live on
    # click rather than being pre-generated at startup).
    gr.Markdown("### πŸ“š Examples")
    gr.Examples(
        examples=examples,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output],
        fn=generate_speech,
        cache_examples=False
    )
    
    gr.Markdown("""
    ---
    ### About Maya1
    Maya1 is a state-of-the-art open-source voice AI model that generates realistic, emotional speech from text. 
    It uses natural language descriptions to design unique voices and supports emotional expressions through special tags.
    
    **Model:** [maya-research/maya1](https://huggingface.co/maya-research/maya1)
    
    ### ZeroGPU Integration
    This Space uses ZeroGPU for efficient GPU allocation. The GPU is only used during inference, 
    allowing for cost-effective hosting while maintaining excellent performance.
    
    ### Important Note
    This model requires specific setup and may have compatibility requirements. If you encounter persistent
    issues, please check the [model card](https://huggingface.co/maya-research/maya1) for the latest information.
    """)

# Script entry point (how Hugging Face Spaces starts the app).
if __name__ == "__main__":
    demo.launch()