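"""Maya1 text-to-speech demo for Hugging Face Spaces.

Generates expressive speech with the maya-research/maya1 model, decodes the
resulting SNAC audio tokens with hubertsiuzdak/snac_24khz, and serves the
result through a Gradio UI running on ZeroGPU.
"""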
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import numpy as np
import tempfile
import os
import spaces
# Global variables for models
model = None
tokenizer = None
snac_model = None
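
# The models are loaded lazily on the first request (see load_models below)
# rather than at import time.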
def load_models():
"""Load models on first use"""
global model, tokenizer, snac_model
if model is None:
print("Loading Maya1 model...")
model = AutoModelForCausalLM.from_pretrained(
"maya-research/maya1",
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
"maya-research/maya1",
trust_remote_code=True
)
# Ensure pad token is set
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer vocab size: {len(tokenizer)}")
print(f"Model vocab size: {model.config.vocab_size}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
print(f"PAD token ID: {tokenizer.pad_token_id}")
if snac_model is None:
print("Loading SNAC audio decoder...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
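
# ZeroGPU attaches a GPU to the process only while this decorated function
# runs; duration=180 requests up to 180 seconds per call.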
@spaces.GPU(duration=180)
def generate_speech(text, voice_description, temperature, top_p, max_tokens):
"""Generate speech from text using Maya1 model with ZeroGPU"""
if not text.strip():
raise gr.Error("Please enter some text to convert to speech!")
if not voice_description.strip():
voice_description = "Realistic voice with neutral tone and conversational pacing."
try:
# Load models if not already loaded
load_models()
# Move models to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
snac_model.to(device)
        # Create prompt - exactly as shown in docs
        prompt = f'<description="{voice_description}"> {text}'
        print(f"Prompt: {prompt}")
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        print(f"Input IDs shape: {inputs['input_ids'].shape}")
        print(f"Input length: {inputs['input_ids'].shape[1]}")
        print(f"First 20 input tokens: {inputs['input_ids'][0][:20].tolist()}")
        # Generate with settings from the documentation.
        # Don't pass eos_token_id, so generation can continue.
        with torch.inference_mode():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs.get('attention_mask', None),
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                # Don't set eos_token_id; let the model decide when to stop,
                # or set it to a value outside the SNAC range.
                eos_token_id=None,
                repetition_penalty=1.1,
            )
        print(f"Output shape: {outputs.shape}")
        print(f"Total output length: {outputs.shape[1]}")
        # Slice off the prompt to keep only the newly generated tokens
        generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
        print(f"Generated {len(generated_ids)} new tokens")
        print(f"First 50 generated IDs: {generated_ids[:50].tolist()}")
        # Filter for SNAC tokens in the correct range
        snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
        print(f"Total SNAC tokens: {len(snac_tokens)}")
        if len(snac_tokens) < 7:
            # Show all generated token IDs for debugging
            all_tokens = generated_ids.tolist()
            unique_tokens = sorted(set(all_tokens))
            print(f"All unique token IDs ({len(unique_tokens)}): {unique_tokens[:100]}")
            # Check whether any tokens fall in the expected range
            in_range = [t for t in all_tokens if 128266 <= t <= 156937]
            print(f"Tokens in SNAC range: {len(in_range)}")
            raise gr.Error(
                f"Model generated only {len(generated_ids)} tokens, with {len(snac_tokens)} SNAC audio tokens. "
                f"Token range: {min(all_tokens) if all_tokens else 'N/A'}-{max(all_tokens) if all_tokens else 'N/A'}. "
                f"Expected SNAC range: 128266-156937. This may indicate a model configuration issue. "
                f"Try: 1) longer input text, 2) increasing max_tokens to 1500, 3) a different temperature (0.6-0.8)."
            )
        # Decode SNAC tokens to audio frames
        frames = len(snac_tokens) // 7
        print(f"Audio frames: {frames}")
        codes = [[], [], []]
        for i in range(frames):
            s = snac_tokens[i*7:(i+1)*7]
            codes[0].append((s[0] - 128266) % 4096)
            codes[1].extend([(s[1] - 128266) % 4096, (s[4] - 128266) % 4096])
            codes[2].extend([
                (s[2] - 128266) % 4096,
                (s[3] - 128266) % 4096,
                (s[5] - 128266) % 4096,
                (s[6] - 128266) % 4096
            ])
        # Build one (1, n) LongTensor per SNAC level for the decoder
        codes_tensor = [
            torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0)
            for c in codes
        ]
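        # quantizer.from_codes() turns the discrete codes back into the latent
        # the decoder expects; snac_24khz reconstructs mono audio at 24 kHz.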
        with torch.inference_mode():
            audio = snac_model.decoder(
                snac_model.quantizer.from_codes(codes_tensor)
            )[0, 0].cpu().numpy()
        print(f"Audio shape: {audio.shape}, Duration: {len(audio)/24000:.2f}s")
        # Save to temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            sf.write(f.name, audio, 24000)
        return f.name
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Error generating speech: {str(e)}")
# Predefined voice presets
voice_presets = {
"Male - American": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
"Female - British": "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
"Male - Deep": "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
"Female - Energetic": "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
"Neutral - Professional": "Professional neutral voice with clear articulation. Balanced pitch, crisp tone, measured pacing.",
"Male - Warm": "Warm male voice with friendly tone. Medium pitch, smooth timbre, relaxed pacing.",
"Custom": ""
}
def update_voice_description(preset):
"""Update voice description based on preset selection"""
return voice_presets.get(preset, "")
# Example texts with emotion tags
examples = [
    [
        "Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
        "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
        0.7,
        0.9,
        1000
    ],
    [
        "I'm so excited to share this amazing news with you! This is incredible and wonderful!",
        "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
        0.8,
        0.9,
        1000
    ],
    [
        "In a world of constant change, one thing remains certain: the power of human connection and understanding.",
        "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
        0.6,
        0.85,
        1000
    ],
    [
        "The gentle breeze whispered through the trees as the sun set over the horizon in beautiful colors.",
        "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
        0.7,
        0.9,
        1000
    ]
]
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
gr.HTML("""
<div style="text-align: center; padding: 20px;">
<h1>ποΈ Maya1 Text-to-Speech</h1>
<p style="font-size: 18px; color: #666;">
Generate emotional and realistic speech with natural language voice design
</p>
<p style="font-size: 14px; margin-top: 10px;">
Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">anycoder</a>
</p>
<p style="font-size: 12px; color: #28a745; margin-top: 5px;">
β‘ Powered by ZeroGPU for efficient inference
</p>
</div>
""")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input")
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter your text here... You can use <laugh>, <sigh>, and other emotion tags!",
                lines=5,
                max_lines=10,
                value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions."
            )
            gr.Markdown("### 🎨 Voice Design")
            voice_preset = gr.Dropdown(
                choices=list(voice_presets.keys()),
                label="Voice Preset",
                value="Male - American",
                info="Select a preset or choose 'Custom' to write your own"
            )
            voice_description = gr.Textbox(
                label="Voice Description",
                placeholder="Describe the voice characteristics...",
                lines=3,
                value=voice_presets["Male - American"],
                info="Describe age, gender, accent, pitch, timbre, and pacing"
            )
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness (higher may help with generation)"
                )
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Nucleus sampling threshold"
                )
                max_tokens = gr.Slider(
                    minimum=500,
                    maximum=2000,
                    value=1000,
                    step=100,
                    label="Max Tokens",
                    info="Maximum length of generated audio (higher = longer audio)"
                )
            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.Markdown("### 🔊 Output")
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False
            )
gr.Markdown("""
### π‘ Tips
- **Use longer sentences** (20+ words recommended)
- Start with **temperature=0.7** and **max_tokens=1000**
- Use emotion tags like `<laugh>`, `<sigh>`, `<whisper>` in your text
- Experiment with different voice descriptions
- GPU allocation: 180 seconds per generation
### π Emotion Tags
You can use various emotion tags in your text:
- `<laugh>` - Laughter
- `<sigh>` - Sighing
- `<whisper>` - Whispering
- `<shout>` - Shouting
### βοΈ Troubleshooting
If generation fails with "not enough tokens":
1. **Increase temperature** to 0.7-0.8
2. **Use longer input text** (full sentences)
3. **Increase max_tokens** to 1500-2000
4. Try different voice descriptions
### π Known Issue
This model may have specific requirements for the Hugging Face Spaces environment.
If issues persist, the model may need additional configuration or dependencies.
""")
    # Update voice description when preset changes
    voice_preset.change(
        fn=update_voice_description,
        inputs=[voice_preset],
        outputs=[voice_description]
    )
    # Generate speech button
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output]
    )
    # Examples section
    gr.Markdown("### 📚 Examples")
    gr.Examples(
        examples=examples,
        inputs=[text_input, voice_description, temperature, top_p, max_tokens],
        outputs=[audio_output],
        fn=generate_speech,
        cache_examples=False
    )
gr.Markdown("""
---
### About Maya1
Maya1 is a state-of-the-art open-source voice AI model that generates realistic, emotional speech from text.
It uses natural language descriptions to design unique voices and supports emotional expressions through special tags.
**Model:** [maya-research/maya1](https://huggingface.co/maya-research/maya1)
### ZeroGPU Integration
This Space uses ZeroGPU for efficient GPU allocation. The GPU is only used during inference,
allowing for cost-effective hosting while maintaining excellent performance.
### Important Note
This model requires specific setup and may have compatibility requirements. If you encounter persistent
issues, please check the [model card](https://huggingface.co/maya-research/maya1) for the latest information.
""")
if __name__ == "__main__":
    demo.launch()