"""Gradio demo that turns text into speech with the Maya1 model on ZeroGPU."""

import os
import tempfile
import traceback

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
from snac import SNAC
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face repositories the demo loads its weights from.
MODEL_ID = "maya-research/maya1"
SNAC_MODEL_ID = "hubertsiuzdak/snac_24khz"

# Sample rate used both for writing the WAV file and for the duration log.
SAMPLE_RATE = 24000

# Inclusive range of generated token IDs that are treated as SNAC audio codes.
SNAC_TOKEN_MIN = 128266
SNAC_TOKEN_MAX = 156937

# Audio codes are consumed in groups of seven; each code is reduced modulo
# the codebook size after subtracting SNAC_TOKEN_MIN.
SNAC_TOKENS_PER_FRAME = 7
SNAC_CODEBOOK_SIZE = 4096

# Voice description used when the caller leaves the field blank.
DEFAULT_VOICE_DESCRIPTION = (
    "Realistic voice with neutral tone and conversational pacing."
)

# Global variables for models (populated lazily by load_models()).
model = None
tokenizer = None
snac_model = None


def load_models():
    """Load the Maya1 model, its tokenizer and the SNAC decoder on first use.

    The loaded objects are stored in the module-level ``model``,
    ``tokenizer`` and ``snac_model`` globals; later calls are no-ops for
    whatever is already loaded.
    """
    global model, tokenizer, snac_model

    if model is None:
        print("Loading Maya1 model...")
        # NOTE: trust_remote_code=True lets the model repository run its own
        # Python code at load time.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            trust_remote_code=True,
        )

        # generate() is given pad_token_id, so make sure one exists.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print(f"Tokenizer vocab size: {len(tokenizer)}")
        print(f"Model vocab size: {model.config.vocab_size}")
        print(f"EOS token ID: {tokenizer.eos_token_id}")
        print(f"PAD token ID: {tokenizer.pad_token_id}")

    if snac_model is None:
        print("Loading SNAC audio decoder...")
        snac_model = SNAC.from_pretrained(SNAC_MODEL_ID).eval()


def _extract_snac_tokens(token_ids):
    """Return the IDs from *token_ids* that fall inside the SNAC range."""
    return [t for t in token_ids if SNAC_TOKEN_MIN <= t <= SNAC_TOKEN_MAX]


def _unpack_snac_frames(snac_tokens):
    """Split a flat SNAC token list into the three code lists SNAC decodes.

    Tokens are read seven at a time; a trailing incomplete group is dropped.
    Within each group, position 0 goes to the first list, positions 1 and 4
    to the second, and positions 2, 3, 5 and 6 to the third.
    """
    codes = [[], [], []]
    usable = len(snac_tokens) - len(snac_tokens) % SNAC_TOKENS_PER_FRAME
    for start in range(0, usable, SNAC_TOKENS_PER_FRAME):
        frame = [
            (tok - SNAC_TOKEN_MIN) % SNAC_CODEBOOK_SIZE
            for tok in snac_tokens[start:start + SNAC_TOKENS_PER_FRAME]
        ]
        codes[0].append(frame[0])
        codes[1].extend((frame[1], frame[4]))
        codes[2].extend((frame[2], frame[3], frame[5], frame[6]))
    return codes


@spaces.GPU(duration=180)
def generate_speech(text, voice_description, temperature, top_p, max_tokens):
    """Generate speech from text using the Maya1 model with ZeroGPU.

    Args:
        text: Text to speak; must contain at least one non-blank character.
        voice_description: Natural-language description of the voice. A
            blank value falls back to ``DEFAULT_VOICE_DESCRIPTION``.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        Path of a temporary ``.wav`` file containing the generated audio.

    Raises:
        gr.Error: If *text* is blank, if fewer than seven SNAC tokens were
            generated, or if any other step fails (the original exception
            is chained as the cause).
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech!")

    if not voice_description or not voice_description.strip():
        voice_description = DEFAULT_VOICE_DESCRIPTION

    try:
        # Load models if not already loaded
        load_models()

        # Move models to GPU when one is available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        snac_model.to(device)

        # The voice description must be part of the prompt, otherwise the
        # voice controls in the UI have no effect. Format follows the Maya1
        # model card: <description="..."> followed by the text.
        prompt = f'<description="{voice_description}"> {text}'
        print(f"Prompt: {prompt}")

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        print(f"Input IDs shape: {inputs['input_ids'].shape}")
        print(f"Input length: {prompt_length}")
        print(f"First 20 input tokens: {inputs['input_ids'][0][:20].tolist()}")

        with torch.inference_mode():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs.get("attention_mask"),
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                # Deliberately not pinned to the tokenizer's EOS token.
                # NOTE(review): confirm how the installed transformers
                # version treats an explicit None here.
                eos_token_id=None,
                repetition_penalty=1.1,
            )

        print(f"Output shape: {outputs.shape}")
        print(f"Total output length: {outputs.shape[1]}")

        # Keep only the newly generated part; convert to plain ints once so
        # the filtering below does not touch the tensor element by element.
        generated_ids = outputs[0, prompt_length:].tolist()
        print(f"Generated {len(generated_ids)} new tokens")
        print(f"First 50 generated IDs: {generated_ids[:50]}")

        # Filter for SNAC tokens in the correct range
        snac_tokens = _extract_snac_tokens(generated_ids)
        print(f"Total SNAC tokens: {len(snac_tokens)}")

        if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
            # Not even one full frame: dump what was generated for debugging.
            unique_tokens = sorted(set(generated_ids))
            print(
                f"All unique token IDs ({len(unique_tokens)}): "
                f"{unique_tokens[:100]}"
            )
            print(f"Tokens in SNAC range: {len(snac_tokens)}")

            lowest = min(generated_ids) if generated_ids else "N/A"
            highest = max(generated_ids) if generated_ids else "N/A"
            raise gr.Error(
                f"Model generated only {len(generated_ids)} tokens, with {len(snac_tokens)} SNAC audio tokens. "
                f"Token range: {lowest}-{highest}. "
                f"Expected SNAC range: 128266-156937. This may indicate a model configuration issue. "
                f"Try: 1) Longer input text, 2) Increase max_tokens to 1500, 3) Different temperature (0.6-0.8)"
            )

        # Decode SNAC tokens to audio frames
        print(f"Audio frames: {len(snac_tokens) // SNAC_TOKENS_PER_FRAME}")
        codes = _unpack_snac_frames(snac_tokens)

        # Generate final audio with SNAC decoder
        codes_tensor = [
            torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
            for level in codes
        ]
        with torch.inference_mode():
            quantized = snac_model.quantizer.from_codes(codes_tensor)
            audio = snac_model.decoder(quantized)[0, 0].cpu().numpy()

        print(
            f"Audio shape: {audio.shape}, "
            f"Duration: {len(audio)/SAMPLE_RATE:.2f}s"
        )

        # Reserve a temp path, close the handle, then let soundfile write to
        # it; delete=False keeps the file around for Gradio to serve.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
            wav_path = handle.name
        sf.write(wav_path, audio, SAMPLE_RATE)
        return wav_path

    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        traceback.print_exc()
        raise gr.Error(f"Error generating speech: {str(e)}") from e


# Predefined voice presets
voice_presets = {
    "Male - American": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
    "Female - British": "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
    "Male - Deep": "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
    "Female - Energetic": "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
    "Neutral - Professional": "Professional neutral voice with clear articulation. Balanced pitch, crisp tone, measured pacing.",
    "Male - Warm": "Warm male voice with friendly tone. Medium pitch, smooth timbre, relaxed pacing.",
    "Custom": "",
}


def update_voice_description(preset):
    """Return the description for *preset*, or "" for an unknown preset."""
    return voice_presets.get(preset, "")


# Example rows: text, voice description, temperature, top_p, max_tokens.
examples = [
    [
        "Hello! This is Maya1 the best open source voice AI model with emotions.",
        "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
        0.7,
        0.9,
        1000,
    ],
    [
        "I'm so excited to share this amazing news with you! This is incredible and wonderful!",
        "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
        0.8,
        0.9,
        1000,
    ],
    [
        "In a world of constant change, one thing remains certain: the power of human connection and understanding.",
        "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
        0.6,
        0.85,
        1000,
    ],
    [
        "The gentle breeze whispered through the trees as the sun set over the horizon in beautiful colors.",
        "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
        0.7,
        0.9,
        1000,
    ],
]

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
    # Header: wrapped in minimal markup so each line renders separately.
    gr.HTML("""
    <div>
        <h1>🎙️ Maya1 Text-to-Speech</h1>
        <p>Generate emotional and realistic speech with natural language voice design</p>
        <p>Built with anycoder</p>
        <p>⚡ Powered by ZeroGPU for efficient inference</p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input")
            # NOTE(review): the tag names in the placeholder appear to have
            # been stripped ("You can use , , and ..."); restore them from
            # the model card.
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter your text here... You can use , , and other emotion tags!",
                lines=5,
                max_lines=10,
                value="Hello! This is Maya1 the best open source voice AI model with emotions.",
            )

            gr.Markdown("### 🎨 Voice Design")
            voice_preset = gr.Dropdown(
                choices=list(voice_presets.keys()),
                label="Voice Preset",
                value="Male - American",
                info="Select a preset or choose 'Custom' to write your own",
            )
            voice_description = gr.Textbox(
                label="Voice Description",
                placeholder="Describe the voice characteristics...",
                lines=3,
                value=voice_presets["Male - American"],
                info="Describe age, gender, accent, pitch, timbre, and pacing",
            )

            with gr.Accordion("⚙️ Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness (higher may help with generation)",
                )
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Nucleus sampling threshold",
                )
                max_tokens = gr.Slider(
                    minimum=500,
                    maximum=2000,
                    value=1000,
                    step=100,
                    label="Max Tokens",
                    info="Maximum length of generated audio (higher = longer audio)",
                )

            generate_btn = gr.Button(
                "🎤 Generate Speech", variant="primary", size="lg"
            )

        with gr.Column(scale=1):
            gr.Markdown("### 🔊 Output")
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )

            # NOTE(review): the emotion-tag names inside the backticks below
            # appear to have been stripped; restore them from the model card.
            gr.Markdown("""
            ### 💡 Tips
            - **Use longer sentences** (20+ words recommended)
            - Start with **temperature=0.7** and **max_tokens=1000**
            - Use emotion tags like ``, ``, `` in your text
            - Experiment with different voice descriptions
            - GPU allocation: 180 seconds per generation

            ### 🎭 Emotion Tags
            You can use various emotion tags in your text:
            - `` - Laughter
            - `` - Sighing
            - `` - Whispering
            - `` - Shouting

            ### ⚙️ Troubleshooting
            If generation fails with "not enough tokens":
            1. **Increase temperature** to 0.7-0.8
            2. **Use longer input text** (full sentences)
            3. **Increase max_tokens** to 1500-2000
            4. Try different voice descriptions

            ### 🔍 Known Issue
            This model may have specific requirements for the Hugging Face Spaces environment. If issues persist, the model may need additional configuration or dependencies.
            """)

    # Update voice description when preset changes
    voice_preset.change(
        fn=update_voice_description,
        inputs=[voice_preset],
        outputs=[voice_description],
    )

    # Generate speech button
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output],
    )

    # Examples section
    gr.Markdown("### 📚 Examples")
    gr.Examples(
        examples=examples,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output],
        fn=generate_speech,
        cache_examples=False,
    )

    gr.Markdown("""
    ---

    ### About Maya1
    Maya1 is a state-of-the-art open-source voice AI model that generates realistic, emotional speech from text. It uses natural language descriptions to design unique voices and supports emotional expressions through special tags.

    **Model:** [maya-research/maya1](https://huggingface.co/maya-research/maya1)

    ### ZeroGPU Integration
    This Space uses ZeroGPU for efficient GPU allocation. The GPU is only used during inference, allowing for cost-effective hosting while maintaining excellent performance.

    ### Important Note
    This model requires specific setup and may have compatibility requirements. If you encounter persistent issues, please check the [model card](https://huggingface.co/maya-research/maya1) for the latest information.
    """)

if __name__ == "__main__":
    demo.launch()