Spaces:

Javedalam
/

gen-k8x8mz2l

Runtime error

App Files Files Community

Javedalam commited on Nov 5

Commit

5d5100f

verified ·

1 Parent(s): 69841d9

Create app.py

Browse files

Files changed (1) hide show

app.py +167 -0

app.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from snac import SNAC
+import soundfile as sf
+import tempfile
+import spaces
+# --- global handles (lazy-loaded) ---
+model = None
+tokenizer = None
+snac_model = None
+def load_models(device: str):
+    """Load Maya1 and SNAC once, with device-aware dtype."""
+    global model, tokenizer, snac_model
+    if tokenizer is None or model is None:
+        dtype = torch.bfloat16 if device == "cuda" else torch.float32
+        print(f"[load_models] loading Maya1 (dtype={dtype}, device={device})")
+        # device_map only on CUDA; on CPU keep None to avoid accelerate errors
+        device_map = "auto" if device == "cuda" else None
+        model = AutoModelForCausalLM.from_pretrained(
+            "maya-research/maya1",
+            torch_dtype=dtype,
+            device_map=device_map,
+            trust_remote_code=True,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            "maya-research/maya1",
+            trust_remote_code=True,
+        )
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+    if snac_model is None:
+        print("[load_models] loading SNAC 24kHz decoder")
+        snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
+        # move later inside handler (after ZeroGPU alloc)
+        return snac
+    return None
+@spaces.GPU(duration=180)
+def generate_speech(text, voice_description, temperature, top_p, max_tokens):
+    if not text.strip():
+        raise gr.Error("Enter some text.")
+    if not voice_description.strip():
+        voice_description = "Realistic voice with neutral tone and conversational pacing."
+    # ZeroGPU gives us CUDA during this call
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # load / ensure models exist
+    snac_fresh = load_models(device)  # returns SNAC if created
+    global snac_model
+    if snac_fresh is not None:
+        snac_model = snac_fresh
+    # move models to the active device (ZeroGPU alloc happened)
+    if device == "cuda":
+        model.to(device)
+        snac_model.to(device)
+    # prompt exactly like the model card
+    prompt = f'<description="{voice_description}"> {text}'
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+    with torch.inference_mode():
+        outputs = model.generate(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs.get("attention_mask"),
+            max_new_tokens=int(max_tokens),
+            temperature=float(temperature),
+            top_p=float(top_p),
+            do_sample=True,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=None,
+            repetition_penalty=1.1,
+        )
+    # SNAC token extraction (7-token frames) — as per model card
+    generated_ids = outputs[0, inputs["input_ids"].shape[1]:]
+    snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
+    frames = len(snac_tokens) // 7
+    if frames == 0:
+        raise gr.Error("No SNAC tokens generated. Try longer text and max_tokens=1200–1500.")
+    codes = [[], [], []]
+    for i in range(frames):
+        s = snac_tokens[i*7:(i+1)*7]
+        codes[0].append((s[0]-128266) % 4096)
+        codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
+        codes[2].extend([
+            (s[2]-128266) % 4096,
+            (s[3]-128266) % 4096,
+            (s[5]-128266) % 4096,
+            (s[6]-128266) % 4096,
+        ])
+    codes_tensor = [torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0) for c in codes]
+    with torch.inference_mode():
+        audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()
+    # write to wav; return filepath for gr.Audio(type="filepath")
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+        sf.write(f.name, audio, 24000)
+        return f.name
+# ------------------- UI -------------------
+voice_presets = {
+    "Male - American": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
+    "Female - British": "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
+    "Male - Deep": "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
+    "Female - Energetic": "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
+    "Neutral - Professional": "Professional neutral voice with clear articulation. Balanced pitch, crisp tone, measured pacing.",
+    "Custom": ""
+}
+def update_voice_description(preset): return voice_presets.get(preset, "")
+with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
+    gr.HTML("""
+        <div style="text-align:center;padding:16px">
+            <h1>🎙️ Maya1 Text-to-Speech</h1>
+            <p style="color:#666">Generate emotional, realistic speech with natural-language voice design</p>
+            <p style="font-size:12px;color:#28a745">⚡ ZeroGPU inference</p>
+        </div>
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            text_input = gr.Textbox(
+                label="Text to Speak",
+                value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
+                lines=5,
+            )
+            voice_preset = gr.Dropdown(choices=list(voice_presets.keys()),
+                                       value="Male - American",
+                                       label="Voice Preset")
+            voice_description = gr.Textbox(
+                label="Voice Description",
+                value=voice_presets["Male - American"],
+                lines=3,
+            )
+            with gr.Accordion("Advanced", open=False):
+                temperature = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature")
+                top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
+                max_tokens = gr.Slider(500, 2000, value=1000, step=100, label="Max tokens")
+            generate_btn = gr.Button("🎤 Generate Speech", variant="primary")
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(label="Generated Speech", type="filepath", interactive=False)
+    voice_preset.change(fn=update_voice_description, inputs=[voice_preset], outputs=[voice_description])
+    generate_btn.click(fn=generate_speech,
+                       inputs=[text_input, voice_description, temperature, top_p, max_tokens],
+                       outputs=[audio_output])
+# Register an explicit API endpoint so Spaces never shows “No API found”
+gr.api(fn=generate_speech, name="generate_speech")
+if __name__ == "__main__":
+    demo.queue()
+    demo.launch(show_error=True)