File size: 14,327 Bytes
568817c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import numpy as np
import tempfile
import os
import spaces

# Global variables for models.
# All three start as None and are lazily populated by load_models() on the
# first generation request, so the Space can boot before the large
# checkpoints are downloaded.
model = None       # Maya1 causal LM (transformers AutoModelForCausalLM)
tokenizer = None   # matching AutoTokenizer
snac_model = None  # SNAC neural audio codec used to decode tokens to waveform

def load_models():
    """Lazily load the Maya1 LLM/tokenizer and the SNAC vocoder into globals.

    Safe to call repeatedly: each model is fetched from the Hub only while
    its module-level slot is still ``None``; later calls are no-ops.
    """
    global model, tokenizer, snac_model

    repo_id = "maya-research/maya1"

    if model is None:
        print("Loading Maya1 model...")
        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        # Some checkpoints ship without a dedicated pad token; fall back to EOS
        # so generate() can pad batches without erroring.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print(f"Tokenizer vocab size: {len(tokenizer)}")
        print(f"Model vocab size: {model.config.vocab_size}")
        print(f"EOS token ID: {tokenizer.eos_token_id}")
        print(f"PAD token ID: {tokenizer.pad_token_id}")

    if snac_model is None:
        print("Loading SNAC audio decoder...")
        snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()

@spaces.GPU(duration=180)
def generate_speech(text: str, voice_description: str, temperature: float, top_p: float, max_tokens: float) -> str:
    """Generate speech from ``text`` with the Maya1 model under ZeroGPU.

    Args:
        text: Text to speak; may contain emotion tags such as ``<laugh>``.
        voice_description: Natural-language voice design prompt. A neutral
            default is substituted when blank.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        max_tokens: Upper bound on newly generated tokens (cast to int).

    Returns:
        Path to a temporary 24 kHz WAV file with the rendered speech.

    Raises:
        gr.Error: On empty input text, too few SNAC audio tokens, or any
            unexpected failure during generation (original exception is
            printed to the log before re-raising).
    """
    
    if not text.strip():
        raise gr.Error("Please enter some text to convert to speech!")
    
    # Blank description -> fall back to a neutral, generic voice prompt.
    if not voice_description.strip():
        voice_description = "Realistic voice with neutral tone and conversational pacing."
    
    try:
        # Load models if not already loaded
        load_models()
        
        # Move models to GPU (inside the @spaces.GPU window, where CUDA exists)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        snac_model.to(device)
        
        # Create prompt - exactly as shown in docs
        prompt = f'<description="{voice_description}"> {text}'
        
        print(f"Prompt: {prompt}")
        
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        
        print(f"Input IDs shape: {inputs['input_ids'].shape}")
        print(f"Input length: {inputs['input_ids'].shape[1]}")
        print(f"First 20 input tokens: {inputs['input_ids'][0][:20].tolist()}")
        
        # Generate with settings from the documentation
        # Don't pass eos_token_id to allow generation to continue
        with torch.inference_mode():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs.get('attention_mask', None),
                max_new_tokens=int(max_tokens), 
                temperature=float(temperature), 
                top_p=float(top_p), 
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                # Don't set eos_token_id - let model decide when to stop
                # or set it to a value outside SNAC range
                eos_token_id=None,
                repetition_penalty=1.1,
            )
        
        print(f"Output shape: {outputs.shape}")
        print(f"Total output length: {outputs.shape[1]}")
        
        # Extract SNAC audio tokens: drop the prompt prefix, keep only the
        # newly generated continuation.
        generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
        
        print(f"Generated {len(generated_ids)} new tokens")
        print(f"First 50 generated IDs: {generated_ids[:50].tolist()}")
        
        # Keep only ids in the SNAC audio range. The range spans
        # 156937 - 128266 + 1 = 28672 = 7 * 4096 ids, matching the
        # 7-tokens-per-frame / 4096-entry-codebook layout decoded below.
        snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
        
        print(f"Total SNAC tokens: {len(snac_tokens)}")
        
        # Fewer than 7 SNAC tokens means not even one full audio frame:
        # dump diagnostics and surface an actionable error to the UI.
        if len(snac_tokens) < 7:
            # Show all generated token IDs for debugging
            all_tokens = generated_ids.tolist()
            unique_tokens = sorted(list(set(all_tokens)))
            print(f"All unique token IDs ({len(unique_tokens)}): {unique_tokens[:100]}")
            
            # Check if any tokens are in expected range
            in_range = [t for t in all_tokens if 128266 <= t <= 156937]
            print(f"Tokens in SNAC range: {len(in_range)}")
            
            raise gr.Error(
                f"Model generated only {len(generated_ids)} tokens, with {len(snac_tokens)} SNAC audio tokens. "
                f"Token range: {min(all_tokens) if all_tokens else 'N/A'}-{max(all_tokens) if all_tokens else 'N/A'}. "
                f"Expected SNAC range: 128266-156937. This may indicate a model configuration issue. "
                f"Try: 1) Longer input text, 2) Increase max_tokens to 1500, 3) Different temperature (0.6-0.8)"
            )
        
        # Decode SNAC tokens to audio frames (integer division drops any
        # trailing partial frame).
        frames = len(snac_tokens) // 7
        
        print(f"Audio frames: {frames}")
        
        # Three lists, one per SNAC codebook level (coarse -> fine).
        codes = [[], [], []]
        
        # Each 7-token frame interleaves the levels as laid out below:
        # position 0 -> level 0; positions 1 and 4 -> level 1;
        # positions 2, 3, 5, 6 -> level 2. Subtracting 128266 rebases the
        # id to the audio range; % 4096 folds it into its codebook slot.
        for i in range(frames):
            s = snac_tokens[i*7:(i+1)*7]
            codes[0].append((s[0]-128266) % 4096)
            codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
            codes[2].extend([
                (s[2]-128266) % 4096, 
                (s[3]-128266) % 4096, 
                (s[5]-128266) % 4096, 
                (s[6]-128266) % 4096
            ])
        
        # Generate final audio with SNAC decoder (each level gets a
        # [1, T] long tensor on the same device as the decoder).
        codes_tensor = [
            torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0) 
            for c in codes
        ]
        
        with torch.inference_mode():
            audio = snac_model.decoder(
                snac_model.quantizer.from_codes(codes_tensor)
            )[0, 0].cpu().numpy()
        
        print(f"Audio shape: {audio.shape}, Duration: {len(audio)/24000:.2f}s")
        
        # Save to temporary file. delete=False is intentional: Gradio's
        # Audio(type="filepath") output reads the file after we return.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            sf.write(f.name, audio, 24000)
            return f.name
            
    except Exception as e:
        # Log the full traceback server-side, then re-raise as a gr.Error so
        # the UI shows a readable message instead of a blank failure.
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Error generating speech: {str(e)}")

# Canned voice-design prompts selectable from the UI dropdown.
# "Custom" maps to an empty string so the user can type their own.
voice_presets = {
    "Male - American": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
    "Female - British": "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
    "Male - Deep": "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
    "Female - Energetic": "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
    "Neutral - Professional": "Professional neutral voice with clear articulation. Balanced pitch, crisp tone, measured pacing.",
    "Male - Warm": "Warm male voice with friendly tone. Medium pitch, smooth timbre, relaxed pacing.",
    "Custom": ""
}

def update_voice_description(preset):
    """Return the stored description for *preset*, or '' for unknown names."""
    if preset in voice_presets:
        return voice_presets[preset]
    return ""

# Example rows for gr.Examples. Column order must match the inputs wired
# below: [text, voice_description, temperature, top_p, max_tokens].
# max_tokens is set to 1000 in every row to leave enough budget for audio.
examples = [
    [
        "Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
        "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
        0.7,
        0.9,
        1000
    ],
    [
        "I'm so excited to share this amazing news with you! This is incredible and wonderful!",
        "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
        0.8,
        0.9,
        1000
    ],
    [
        "In a world of constant change, one thing remains certain: the power of human connection and understanding.",
        "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
        0.6,
        0.85,
        1000
    ],
    [
        "The gentle breeze whispered through the trees as the sun set over the horizon in beautiful colors.",
        "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
        0.7,
        0.9,
        1000
    ]
]

# Create Gradio interface. `demo` is a Blocks app: left column holds the
# inputs (text, voice design, advanced sampling settings), right column the
# audio output plus usage tips; event wiring follows the layout.
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
    # Static page header.
    gr.HTML("""
        <div style="text-align: center; padding: 20px;">
            <h1>πŸŽ™οΈ Maya1 Text-to-Speech</h1>
            <p style="font-size: 18px; color: #666;">
                Generate emotional and realistic speech with natural language voice design
            </p>
            <p style="font-size: 14px; margin-top: 10px;">
                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">anycoder</a>
            </p>
            <p style="font-size: 12px; color: #28a745; margin-top: 5px;">
                ⚑ Powered by ZeroGPU for efficient inference
            </p>
        </div>
    """)
    
    with gr.Row():
        # Left column: all generation inputs.
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“ Input")
            
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter your text here... You can use <laugh>, <sigh>, and other emotion tags!",
                lines=5,
                max_lines=10,
                value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions."
            )
            
            gr.Markdown("### 🎨 Voice Design")
            
            # Preset dropdown; selecting one overwrites the description box
            # via the .change handler wired below.
            voice_preset = gr.Dropdown(
                choices=list(voice_presets.keys()),
                label="Voice Preset",
                value="Male - American",
                info="Select a preset or choose 'Custom' to write your own"
            )
            
            voice_description = gr.Textbox(
                label="Voice Description",
                placeholder="Describe the voice characteristics...",
                lines=3,
                value=voice_presets["Male - American"],
                info="Describe age, gender, accent, pitch, timbre, and pacing"
            )
            
            # Sampling knobs forwarded directly to generate_speech.
            with gr.Accordion("βš™οΈ Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness (higher may help with generation)"
                )
                
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Nucleus sampling threshold"
                )
                
                max_tokens = gr.Slider(
                    minimum=500,
                    maximum=2000,
                    value=1000,
                    step=100,
                    label="Max Tokens",
                    info="Maximum length of generated audio (higher = longer audio)"
                )
            
            generate_btn = gr.Button("🎀 Generate Speech", variant="primary", size="lg")
        
        # Right column: generated audio and help text.
        with gr.Column(scale=1):
            gr.Markdown("### πŸ”Š Output")
            
            # type="filepath" matches generate_speech returning a WAV path.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False
            )
            
            gr.Markdown("""
            ### πŸ’‘ Tips
            - **Use longer sentences** (20+ words recommended)
            - Start with **temperature=0.7** and **max_tokens=1000**
            - Use emotion tags like `<laugh>`, `<sigh>`, `<whisper>` in your text
            - Experiment with different voice descriptions
            - GPU allocation: 180 seconds per generation
            
            ### 🎭 Emotion Tags
            You can use various emotion tags in your text:
            - `<laugh>` - Laughter
            - `<sigh>` - Sighing
            - `<whisper>` - Whispering
            - `<shout>` - Shouting
            
            ### βš™οΈ Troubleshooting
            If generation fails with "not enough tokens":
            1. **Increase temperature** to 0.7-0.8
            2. **Use longer input text** (full sentences)
            3. **Increase max_tokens** to 1500-2000
            4. Try different voice descriptions
            
            ### πŸ” Known Issue
            This model may have specific requirements for the Hugging Face Spaces environment.
            If issues persist, the model may need additional configuration or dependencies.
            """)
    
    # Update voice description when preset changes
    voice_preset.change(
        fn=update_voice_description,
        inputs=[voice_preset],
        outputs=[voice_description]
    )
    
    # Generate speech button -> runs the ZeroGPU-decorated generator.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output]
    )
    
    # Examples section (cache_examples=False: each example renders live on
    # click rather than being pre-generated at startup).
    gr.Markdown("### πŸ“š Examples")
    gr.Examples(
        examples=examples,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output],
        fn=generate_speech,
        cache_examples=False
    )
    
    gr.Markdown("""
    ---
    ### About Maya1
    Maya1 is a state-of-the-art open-source voice AI model that generates realistic, emotional speech from text. 
    It uses natural language descriptions to design unique voices and supports emotional expressions through special tags.
    
    **Model:** [maya-research/maya1](https://huggingface.co/maya-research/maya1)
    
    ### ZeroGPU Integration
    This Space uses ZeroGPU for efficient GPU allocation. The GPU is only used during inference, 
    allowing for cost-effective hosting while maintaining excellent performance.
    
    ### Important Note
    This model requires specific setup and may have compatibility requirements. If you encounter persistent
    issues, please check the [model card](https://huggingface.co/maya-research/maya1) for the latest information.
    """)

# Script entry point (how Hugging Face Spaces starts the app).
if __name__ == "__main__":
    demo.launch()