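# Gradio demo: Maya1 text-to-speech with SNAC audio decoding, hosted on ZeroGPU.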
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import tempfile
import spaces
# Global variables for models
model = None
tokenizer = None
snac_model = None
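# The models are loaded lazily, on the first generation request, so the Space
# starts quickly instead of downloading checkpoints at import time.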
def load_models():
"""Load models on first use"""
global model, tokenizer, snac_model
if model is None:
print("Loading Maya1 model...")
model = AutoModelForCausalLM.from_pretrained(
"maya-research/maya1",
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
"maya-research/maya1",
trust_remote_code=True
)
# Ensure pad token is set
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer vocab size: {len(tokenizer)}")
print(f"Model vocab size: {model.config.vocab_size}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
print(f"PAD token ID: {tokenizer.pad_token_id}")
if snac_model is None:
print("Loading SNAC audio decoder...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
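# spaces.GPU attaches a ZeroGPU device only for the duration of each decorated
# call (capped at 180 seconds here); CUDA is unavailable outside such calls,
# which is why the models are moved to the device inside generate_speech.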
@spaces.GPU(duration=180)
def generate_speech(text, voice_description, temperature, top_p, max_tokens):
"""Generate speech from text using Maya1 model with ZeroGPU"""
if not text.strip():
raise gr.Error("Please enter some text to convert to speech!")
if not voice_description.strip():
voice_description = "Realistic voice with neutral tone and conversational pacing."
try:
# Load models if not already loaded
load_models()
# Move models to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
snac_model.to(device)
# Create prompt - exactly as shown in docs
prompt = f'<description="{voice_description}"> {text}'
print(f"Prompt: {prompt}")
# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to(device)
print(f"Input IDs shape: {inputs['input_ids'].shape}")
print(f"Input length: {inputs['input_ids'].shape[1]}")
print(f"First 20 input tokens: {inputs['input_ids'][0][:20].tolist()}")
        # Sampling settings follow the model documentation; EOS stopping is
        # disabled below so generation is not cut off early
with torch.inference_mode():
outputs = model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs.get('attention_mask', None),
max_new_tokens=int(max_tokens),
temperature=float(temperature),
top_p=float(top_p),
do_sample=True,
pad_token_id=tokenizer.pad_token_id,
                # Explicitly disable EOS stopping so the model keeps emitting
                # SNAC tokens instead of terminating early
                eos_token_id=None,
repetition_penalty=1.1,
)
print(f"Output shape: {outputs.shape}")
print(f"Total output length: {outputs.shape[1]}")
# Extract SNAC audio tokens
generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
print(f"Generated {len(generated_ids)} new tokens")
print(f"First 50 generated IDs: {generated_ids[:50].tolist()}")
        # Keep only tokens in the SNAC audio range: 7 codebook slots x 4096 codes
        # per slot, starting at ID 128266 (so IDs 128266-156937 inclusive)
snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
print(f"Total SNAC tokens: {len(snac_tokens)}")
if len(snac_tokens) < 7:
# Show all generated token IDs for debugging
all_tokens = generated_ids.tolist()
unique_tokens = sorted(list(set(all_tokens)))
print(f"All unique token IDs ({len(unique_tokens)}): {unique_tokens[:100]}")
# Check if any tokens are in expected range
in_range = [t for t in all_tokens if 128266 <= t <= 156937]
print(f"Tokens in SNAC range: {len(in_range)}")
raise gr.Error(
f"Model generated only {len(generated_ids)} tokens, with {len(snac_tokens)} SNAC audio tokens. "
f"Token range: {min(all_tokens) if all_tokens else 'N/A'}-{max(all_tokens) if all_tokens else 'N/A'}. "
f"Expected SNAC range: 128266-156937. This may indicate a model configuration issue. "
f"Try: 1) Longer input text, 2) Increase max_tokens to 1500, 3) Different temperature (0.6-0.8)"
)
# Decode SNAC tokens to audio frames
frames = len(snac_tokens) // 7
print(f"Audio frames: {frames}")
codes = [[], [], []]
for i in range(frames):
s = snac_tokens[i*7:(i+1)*7]
codes[0].append((s[0]-128266) % 4096)
codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
codes[2].extend([
(s[2]-128266) % 4096,
(s[3]-128266) % 4096,
(s[5]-128266) % 4096,
(s[6]-128266) % 4096
])
# Generate final audio with SNAC decoder
codes_tensor = [
torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0)
for c in codes
]
with torch.inference_mode():
audio = snac_model.decoder(
snac_model.quantizer.from_codes(codes_tensor)
)[0, 0].cpu().numpy()
print(f"Audio shape: {audio.shape}, Duration: {len(audio)/24000:.2f}s")
# Save to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
sf.write(f.name, audio, 24000)
return f.name
except Exception as e:
import traceback
traceback.print_exc()
raise gr.Error(f"Error generating speech: {str(e)}")
# Predefined voice presets
voice_presets = {
"Male - American": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
"Female - British": "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
"Male - Deep": "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
"Female - Energetic": "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
"Neutral - Professional": "Professional neutral voice with clear articulation. Balanced pitch, crisp tone, measured pacing.",
"Male - Warm": "Warm male voice with friendly tone. Medium pitch, smooth timbre, relaxed pacing.",
"Custom": ""
}
def update_voice_description(preset):
"""Update voice description based on preset selection"""
return voice_presets.get(preset, "")
# Example inputs: (text, voice description, temperature, top_p, max_tokens)
examples = [
[
"Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
"Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
0.7,
0.9,
1000
],
[
"I'm so excited to share this amazing news with you! This is incredible and wonderful!",
"Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
0.8,
0.9,
1000
],
[
"In a world of constant change, one thing remains certain: the power of human connection and understanding.",
"Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
0.6,
0.85,
1000
],
[
"The gentle breeze whispered through the trees as the sun set over the horizon in beautiful colors.",
"Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
0.7,
0.9,
1000
]
]
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
gr.HTML("""
<div style="text-align: center; padding: 20px;">
<h1>πŸŽ™οΈ Maya1 Text-to-Speech</h1>
<p style="font-size: 18px; color: #666;">
Generate emotional and realistic speech with natural language voice design
</p>
<p style="font-size: 14px; margin-top: 10px;">
Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">anycoder</a>
</p>
<p style="font-size: 12px; color: #28a745; margin-top: 5px;">
⚑ Powered by ZeroGPU for efficient inference
</p>
</div>
""")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### πŸ“ Input")
text_input = gr.Textbox(
label="Text to Speak",
placeholder="Enter your text here... You can use <laugh>, <sigh>, and other emotion tags!",
lines=5,
max_lines=10,
value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions."
)
gr.Markdown("### 🎨 Voice Design")
voice_preset = gr.Dropdown(
choices=list(voice_presets.keys()),
label="Voice Preset",
value="Male - American",
info="Select a preset or choose 'Custom' to write your own"
)
voice_description = gr.Textbox(
label="Voice Description",
placeholder="Describe the voice characteristics...",
lines=3,
value=voice_presets["Male - American"],
info="Describe age, gender, accent, pitch, timbre, and pacing"
)
with gr.Accordion("βš™οΈ Advanced Settings", open=False):
temperature = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.7,
step=0.1,
label="Temperature",
info="Controls randomness (higher may help with generation)"
)
top_p = gr.Slider(
minimum=0.5,
maximum=1.0,
value=0.9,
step=0.05,
label="Top P",
info="Nucleus sampling threshold"
)
max_tokens = gr.Slider(
minimum=500,
maximum=2000,
value=1000,
step=100,
label="Max Tokens",
info="Maximum length of generated audio (higher = longer audio)"
)
generate_btn = gr.Button("🎀 Generate Speech", variant="primary", size="lg")
with gr.Column(scale=1):
gr.Markdown("### πŸ”Š Output")
audio_output = gr.Audio(
label="Generated Speech",
type="filepath",
interactive=False
)
gr.Markdown("""
### πŸ’‘ Tips
- **Use longer sentences** (20+ words recommended)
- Start with **temperature=0.7** and **max_tokens=1000**
- Use emotion tags like `<laugh>`, `<sigh>`, `<whisper>` in your text
- Experiment with different voice descriptions
- GPU allocation: 180 seconds per generation
### 🎭 Emotion Tags
You can use various emotion tags in your text:
- `<laugh>` - Laughter
- `<sigh>` - Sighing
- `<whisper>` - Whispering
- `<shout>` - Shouting
### βš™οΈ Troubleshooting
If generation fails with "not enough tokens":
1. **Increase temperature** to 0.7-0.8
2. **Use longer input text** (full sentences)
3. **Increase max_tokens** to 1500-2000
4. Try different voice descriptions
### πŸ” Known Issue
This model may have specific requirements for the Hugging Face Spaces environment.
If issues persist, the model may need additional configuration or dependencies.
""")
# Update voice description when preset changes
voice_preset.change(
fn=update_voice_description,
inputs=[voice_preset],
outputs=[voice_description]
)
# Generate speech button
generate_btn.click(
fn=generate_speech,
inputs=[text_input, voice_description, temperature, top_p, max_tokens],
outputs=[audio_output]
)
# Examples section
gr.Markdown("### πŸ“š Examples")
gr.Examples(
examples=examples,
inputs=[text_input, voice_description, temperature, top_p, max_tokens],
outputs=[audio_output],
fn=generate_speech,
cache_examples=False
)
gr.Markdown("""
---
### About Maya1
Maya1 is a state-of-the-art open-source voice AI model that generates realistic, emotional speech from text.
It uses natural language descriptions to design unique voices and supports emotional expressions through special tags.
**Model:** [maya-research/maya1](https://huggingface.co/maya-research/maya1)
### ZeroGPU Integration
This Space uses ZeroGPU for efficient GPU allocation. The GPU is only used during inference,
allowing for cost-effective hosting while maintaining excellent performance.
### Important Note
This model requires specific setup and may have compatibility requirements. If you encounter persistent
issues, please check the [model card](https://huggingface.co/maya-research/maya1) for the latest information.
""")
if __name__ == "__main__":
demo.launch()