Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from snac import SNAC | |
| import soundfile as sf | |
| import numpy as np | |
| import tempfile | |
| import os | |
| import spaces | |
# Lazily populated handles for the language model, its tokenizer, and the
# SNAC audio decoder. All three start out unset.
model = tokenizer = snac_model = None
def load_models():
    """Load the Maya1 model, its tokenizer, and the SNAC decoder on first use.

    The loaded objects are stored in the module-level ``model``,
    ``tokenizer`` and ``snac_model`` globals. Anything that is already
    loaded is left untouched, so repeated calls are cheap.

    The language model and the tokenizer are loaded into local variables and
    assigned to the globals together. If either ``from_pretrained`` call
    raises, ``model`` is still ``None`` afterwards and the next call retries
    both, instead of leaving a loaded model paired with ``tokenizer = None``.

    Raises:
        Exception: any error raised by the ``from_pretrained`` calls is
            propagated unchanged.
    """
    global model, tokenizer, snac_model

    if model is None:
        print("Loading Maya1 model...")
        # NOTE: trust_remote_code=True allows Python code shipped in the
        # model repository to run locally; only keep it for trusted repos.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            "maya-research/maya1",
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            "maya-research/maya1",
            trust_remote_code=True,
        )

        # Ensure pad token is set
        if loaded_tokenizer.pad_token is None:
            loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

        print(f"Tokenizer vocab size: {len(loaded_tokenizer)}")
        print(f"Model vocab size: {loaded_model.config.vocab_size}")
        print(f"EOS token ID: {loaded_tokenizer.eos_token_id}")
        print(f"PAD token ID: {loaded_tokenizer.pad_token_id}")

        # Publish both only after both loaded successfully.
        model, tokenizer = loaded_model, loaded_tokenizer

    if snac_model is None:
        print("Loading SNAC audio decoder...")
        snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
# Token-id range that is treated as SNAC audio codes in the model output.
_SNAC_MIN_ID = 128266
_SNAC_MAX_ID = 156937
# Number of consecutive SNAC tokens that make up one audio frame.
_SNAC_TOKENS_PER_FRAME = 7
# Each SNAC token is reduced modulo this value after removing the offset.
_SNAC_CODEBOOK_SIZE = 4096
# Sample rate used for the duration log line and the written WAV file.
_SAMPLE_RATE = 24000


def _snac_tokens_to_codes(snac_tokens):
    """Regroup a flat list of SNAC token ids into three code lists.

    Tokens are consumed in frames of seven; trailing tokens that do not fill
    a whole frame are ignored. Every id has the SNAC offset subtracted and is
    reduced modulo 4096. Within a frame, position 0 goes to list 0,
    positions 1 and 4 go to list 1, and positions 2, 3, 5 and 6 go to list 2.

    Args:
        snac_tokens: list of integer token ids.

    Returns:
        A list of three lists of ints holding 1, 2 and 4 codes per frame.
    """
    codes = [[], [], []]
    frame_count = len(snac_tokens) // _SNAC_TOKENS_PER_FRAME
    for start in range(0, frame_count * _SNAC_TOKENS_PER_FRAME, _SNAC_TOKENS_PER_FRAME):
        s = [
            (token - _SNAC_MIN_ID) % _SNAC_CODEBOOK_SIZE
            for token in snac_tokens[start:start + _SNAC_TOKENS_PER_FRAME]
        ]
        codes[0].append(s[0])
        codes[1].extend((s[1], s[4]))
        codes[2].extend((s[2], s[3], s[5], s[6]))
    return codes


# Request a ZeroGPU allocation for the duration of one generation call.
@spaces.GPU(duration=180)
def generate_speech(text, voice_description, temperature, top_p, max_tokens):
    """Generate speech from text using Maya1 model with ZeroGPU.

    Args:
        text: text to speak; must contain at least one non-space character.
        voice_description: free-form description of the voice. When empty,
            a neutral default description is used.
        temperature: sampling temperature, converted with ``float()``.
        top_p: nucleus sampling threshold, converted with ``float()``.
        max_tokens: maximum number of new tokens, converted with ``int()``.

    Returns:
        Path of a temporary ``.wav`` file containing the generated audio.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: if ``text`` is empty, if fewer than seven SNAC tokens were
            generated, or if any other step fails (the original exception is
            chained as the cause).
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech!")
    if not voice_description or not voice_description.strip():
        voice_description = "Realistic voice with neutral tone and conversational pacing."

    try:
        # Load models if not already loaded.
        # NOTE(review): under ZeroGPU this call may run in a separate worker
        # process, in which case lazily loaded globals might not persist
        # between calls - confirm against the ZeroGPU documentation.
        load_models()

        # Move models to GPU
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        snac_model.to(device)

        # Create prompt.
        # NOTE(review): confirm against the model card whether this prompt
        # must be wrapped in additional special tokens.
        prompt = f'<description="{voice_description}"> {text}'
        print(f"Prompt: {prompt}")

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs['input_ids'].shape[1]
        print(f"Input IDs shape: {inputs['input_ids'].shape}")
        print(f"Input length: {prompt_length}")
        print(f"First 20 input tokens: {inputs['input_ids'][0][:20].tolist()}")

        with torch.inference_mode():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs.get('attention_mask', None),
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                # Don't set eos_token_id - let model decide when to stop
                # or set it to a value outside SNAC range
                eos_token_id=None,
                repetition_penalty=1.1,
            )
        print(f"Output shape: {outputs.shape}")
        print(f"Total output length: {outputs.shape[1]}")

        # Drop the prompt and copy the new ids to the host once, rather
        # than comparing device tensor elements one at a time.
        generated_ids = outputs[0, prompt_length:].tolist()
        print(f"Generated {len(generated_ids)} new tokens")
        print(f"First 50 generated IDs: {generated_ids[:50]}")

        # Filter for SNAC tokens in the correct range
        snac_tokens = [t for t in generated_ids if _SNAC_MIN_ID <= t <= _SNAC_MAX_ID]
        print(f"Total SNAC tokens: {len(snac_tokens)}")

        if len(snac_tokens) < _SNAC_TOKENS_PER_FRAME:
            # Not even one full frame: log what was generated for debugging.
            unique_tokens = sorted(set(generated_ids))
            print(f"All unique token IDs ({len(unique_tokens)}): {unique_tokens[:100]}")
            print(f"Tokens in SNAC range: {len(snac_tokens)}")
            raise gr.Error(
                f"Model generated only {len(generated_ids)} tokens, with {len(snac_tokens)} SNAC audio tokens. "
                f"Token range: {min(generated_ids) if generated_ids else 'N/A'}-{max(generated_ids) if generated_ids else 'N/A'}. "
                f"Expected SNAC range: 128266-156937. This may indicate a model configuration issue. "
                f"Try: 1) Longer input text, 2) Increase max_tokens to 1500, 3) Different temperature (0.6-0.8)"
            )

        # Decode SNAC tokens to audio frames
        frames = len(snac_tokens) // _SNAC_TOKENS_PER_FRAME
        print(f"Audio frames: {frames}")
        codes = _snac_tokens_to_codes(snac_tokens)

        # Generate final audio with SNAC decoder
        codes_tensor = [
            torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0)
            for c in codes
        ]
        with torch.inference_mode():
            audio = snac_model.decoder(
                snac_model.quantizer.from_codes(codes_tensor)
            )[0, 0].cpu().numpy()
        print(f"Audio shape: {audio.shape}, Duration: {len(audio)/_SAMPLE_RATE:.2f}s")

        # Reserve a temporary path, close the handle, then write the WAV so
        # the file is never open twice at the same time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            wav_path = f.name
        sf.write(wav_path, audio, _SAMPLE_RATE)
        return wav_path
    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Error generating speech: {str(e)}") from e
# Built-in voice presets: preset label -> voice description text.
# "Custom" maps to an empty description.
voice_presets = {
    "Male - American": (
        "Realistic male voice in the 30s age with american accent. "
        "Normal pitch, warm timbre, conversational pacing."
    ),
    "Female - British": (
        "Clear female voice in the 20s age with British accent. "
        "Pleasant tone, articulate delivery, moderate pacing."
    ),
    "Male - Deep": (
        "Deep male voice with authoritative tone. "
        "Low pitch, resonant timbre, steady pacing."
    ),
    "Female - Energetic": (
        "Energetic female voice with enthusiastic tone. "
        "Higher pitch, bright timbre, upbeat pacing."
    ),
    "Neutral - Professional": (
        "Professional neutral voice with clear articulation. "
        "Balanced pitch, crisp tone, measured pacing."
    ),
    "Male - Warm": (
        "Warm male voice with friendly tone. "
        "Medium pitch, smooth timbre, relaxed pacing."
    ),
    "Custom": "",
}
def update_voice_description(preset):
    """Return the description stored for ``preset``.

    Labels that are not keys of ``voice_presets`` yield an empty string.
    """
    try:
        return voice_presets[preset]
    except KeyError:
        return ""
# Example inputs: [text, voice description, temperature, top_p, max_tokens].
# Every example uses max_tokens = 1000, appended to each row below.
examples = [
    [sample_text, sample_voice, sample_temperature, sample_top_p, 1000]
    for sample_text, sample_voice, sample_temperature, sample_top_p in (
        (
            "Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
            "Realistic male voice in the 30s age with american accent. "
            "Normal pitch, warm timbre, conversational pacing.",
            0.7,
            0.9,
        ),
        (
            "I'm so excited to share this amazing news with you! This is incredible and wonderful!",
            "Energetic female voice with enthusiastic tone. "
            "Higher pitch, bright timbre, upbeat pacing.",
            0.8,
            0.9,
        ),
        (
            "In a world of constant change, one thing remains certain: "
            "the power of human connection and understanding.",
            "Deep male voice with authoritative tone. "
            "Low pitch, resonant timbre, steady pacing.",
            0.6,
            0.85,
        ),
        (
            "The gentle breeze whispered through the trees as the sun set "
            "over the horizon in beautiful colors.",
            "Clear female voice in the 20s age with British accent. "
            "Pleasant tone, articulate delivery, moderate pacing.",
            0.7,
            0.9,
        ),
    )
]
# Create Gradio interface.
# Layout: a header, then a two-column row (inputs and settings on the left,
# audio output and usage notes on the right), then the event wiring, the
# examples table, and an "About" section.
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
    gr.HTML("""
        <div style="text-align: center; padding: 20px;">
        <h1>🎙️ Maya1 Text-to-Speech</h1>
        <p style="font-size: 18px; color: #666;">
        Generate emotional and realistic speech with natural language voice design
        </p>
        <p style="font-size: 14px; margin-top: 10px;">
        Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">anycoder</a>
        </p>
        <p style="font-size: 12px; color: #28a745; margin-top: 5px;">
        ⚡ Powered by ZeroGPU for efficient inference
        </p>
        </div>
    """)

    with gr.Row():
        # Left column: text, voice design, sampling settings, and the button.
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input")
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter your text here... You can use <laugh>, <sigh>, and other emotion tags!",
                lines=5,
                max_lines=10,
                value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions."
            )

            gr.Markdown("### 🎨 Voice Design")
            voice_preset = gr.Dropdown(
                choices=list(voice_presets.keys()),
                label="Voice Preset",
                value="Male - American",
                info="Select a preset or choose 'Custom' to write your own"
            )
            voice_description = gr.Textbox(
                label="Voice Description",
                placeholder="Describe the voice characteristics...",
                lines=3,
                value=voice_presets["Male - American"],
                info="Describe age, gender, accent, pitch, timbre, and pacing"
            )

            with gr.Accordion("⚙️ Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness (higher may help with generation)"
                )
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Nucleus sampling threshold"
                )
                max_tokens = gr.Slider(
                    minimum=500,
                    maximum=2000,
                    value=1000,
                    step=100,
                    label="Max Tokens",
                    info="Maximum length of generated audio (higher = longer audio)"
                )

            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")

        # Right column: generated audio plus usage notes.
        with gr.Column(scale=1):
            gr.Markdown("### 🔊 Output")
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False
            )
            gr.Markdown("""
            ### 💡 Tips
            - **Use longer sentences** (20+ words recommended)
            - Start with **temperature=0.7** and **max_tokens=1000**
            - Use emotion tags like `<laugh>`, `<sigh>`, `<whisper>` in your text
            - Experiment with different voice descriptions
            - GPU allocation: 180 seconds per generation
            ### 🎭 Emotion Tags
            You can use various emotion tags in your text:
            - `<laugh>` - Laughter
            - `<sigh>` - Sighing
            - `<whisper>` - Whispering
            - `<shout>` - Shouting
            ### ⚠️ Troubleshooting
            If generation fails with "not enough tokens":
            1. **Increase temperature** to 0.7-0.8
            2. **Use longer input text** (full sentences)
            3. **Increase max_tokens** to 1500-2000
            4. Try different voice descriptions
            ### 🐛 Known Issue
            This model may have specific requirements for the Hugging Face Spaces environment.
            If issues persist, the model may need additional configuration or dependencies.
            """)

    # Update voice description when preset changes
    voice_preset.change(
        fn=update_voice_description,
        inputs=[voice_preset],
        outputs=[voice_description]
    )

    # Generate speech button
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output]
    )

    # Examples section; cache_examples=False means no example output is
    # precomputed.
    gr.Markdown("### 📚 Examples")
    gr.Examples(
        examples=examples,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output],
        fn=generate_speech,
        cache_examples=False
    )

    gr.Markdown("""
    ---
    ### About Maya1
    Maya1 is a state-of-the-art open-source voice AI model that generates realistic, emotional speech from text.
    It uses natural language descriptions to design unique voices and supports emotional expressions through special tags.
    **Model:** [maya-research/maya1](https://huggingface.co/maya-research/maya1)
    ### ZeroGPU Integration
    This Space uses ZeroGPU for efficient GPU allocation. The GPU is only used during inference,
    allowing for cost-effective hosting while maintaining excellent performance.
    ### Important Note
    This model requires specific setup and may have compatibility requirements. If you encounter persistent
    issues, please check the [model card](https://huggingface.co/maya-research/maya1) for the latest information.
    """)
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()