# littlebird13's picture
# Update app.py
# 6f10747 verified
import gradio as gr
import os
import requests
import base64
import pathlib
import threading
import tempfile
from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat
import dashscope
import wave
import numpy as np
# ======= Constants Configuration =======
# TTS model used both as the voice-enrollment target and for realtime synthesis.
DEFAULT_TARGET_MODEL = "qwen3-tts-vc-realtime-2025-11-27"
# Display name requested for the cloned voice at enrollment time.
DEFAULT_PREFERRED_NAME = "custom_voice"
# MIME type embedded in the base64 data URI sent to the enrollment API.
DEFAULT_AUDIO_MIME_TYPE = "audio/wav"
def init_dashscope_api_key():
    """Initialize the dashscope SDK with the API key from the environment.

    Reads the ``API_KEY`` environment variable, assigns it to
    ``dashscope.api_key``, and returns it.

    Returns:
        str: The API key.

    Raises:
        ValueError: If the environment variable is missing or empty.
    """
    # Use .get() instead of os.environ[...] so a missing variable reaches the
    # explicit ValueError below rather than escaping as a bare KeyError.
    api_key = os.environ.get('API_KEY')
    if not api_key:
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")
    dashscope.api_key = api_key
    return api_key
def create_voice(file_path: str,
                 target_model: str = DEFAULT_TARGET_MODEL,
                 preferred_name: str = DEFAULT_PREFERRED_NAME,
                 audio_mime_type: str = DEFAULT_AUDIO_MIME_TYPE) -> str:
    """Enroll a custom voice from an audio sample via the DashScope API.

    Args:
        file_path: Path to the reference audio file.
        target_model: TTS model the voice is enrolled for.
        preferred_name: Human-readable name requested for the voice.
        audio_mime_type: MIME type embedded in the base64 data URI.

    Returns:
        str: The voice ID assigned by the enrollment service.

    Raises:
        ValueError: If the API_KEY environment variable is missing or empty.
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: On HTTP errors, SSL/timeout failures, or an
            unexpected response payload.
    """
    # Local imports keep the retry helpers out of the module namespace.
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # .get() avoids an uninformative KeyError when the variable is unset.
    api_key = os.environ.get('API_KEY')
    if not api_key:
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")

    file_path_obj = pathlib.Path(file_path)
    if not file_path_obj.exists():
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    # The API expects the audio sample inline as a base64 data URI.
    base64_str = base64.b64encode(file_path_obj.read_bytes()).decode()
    data_uri = f"data:{audio_mime_type};base64,{base64_str}"
    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    payload = {
        "model": "qwen-voice-enrollment",
        "input": {
            "action": "create",
            "target_model": target_model,
            "preferred_name": preferred_name,
            "audio": {"data": data_uri}
        }
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Session with SSL verification enabled and automatic retries on rate
    # limiting / transient server errors.
    session = requests.Session()
    session.verify = True  # Enable SSL verification
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    try:
        resp = session.post(url, json=payload, headers=headers, timeout=60)
        if resp.status_code != 200:
            raise RuntimeError(f"Failed to create voice: {resp.status_code}, {resp.text}")
        return resp.json()["output"]["voice"]
    except requests.exceptions.SSLError as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"SSL connection error: {e}. Please check network environment or try using a proxy") from e
    except requests.exceptions.Timeout as e:
        raise RuntimeError(f"Request timeout: {e}") from e
    except (KeyError, ValueError) as e:
        raise RuntimeError(f"Failed to parse voice response: {e}") from e
    finally:
        session.close()
class TTSCallback(QwenTtsRealtimeCallback):
    """TTS streaming callback that accumulates synthesized audio chunks.

    Collects base64-decoded PCM chunks from ``response.audio.delta`` events
    and signals completion (or failure) via a threading.Event.
    """

    def __init__(self):
        # Set when the session finishes or a callback error occurs.
        self.complete_event = threading.Event()
        # Raw PCM chunks in arrival order.
        self.audio_chunks = []
        # Message of the first callback error, or None on success.
        self.error_msg = None

    def on_open(self) -> None:
        print('[TTS] Connection established')

    def on_close(self, close_status_code, close_msg) -> None:
        print(f'[TTS] Connection closed code={close_status_code}, msg={close_msg}')

    def on_event(self, response: dict) -> None:
        """Dispatch one server event; collect audio and track completion."""
        try:
            event_type = response.get('type', '')
            if event_type == 'session.created':
                print(f'[TTS] Session started: {response["session"]["id"]}')
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode to raw bytes.
                audio_data = base64.b64decode(response['delta'])
                self.audio_chunks.append(audio_data)
            elif event_type == 'response.done':
                print('[TTS] Response completed')
            elif event_type == 'session.finished':
                print('[TTS] Session finished')
                self.complete_event.set()
        except Exception as e:
            # Record the failure and unblock any waiter so callers don't hang.
            self.error_msg = str(e)
            print(f'[Error] Exception while processing callback event: {e}')
            self.complete_event.set()

    def wait_for_finished(self, timeout=None):
        """Block until the session finishes (or fails).

        Args:
            timeout: Optional seconds to wait; None (the default, matching
                the previous behavior) waits indefinitely.

        Returns:
            bool: True if the completion event was set, False on timeout.
        """
        return self.complete_event.wait(timeout)

    def get_audio_data(self):
        """Return all collected audio chunks concatenated into one bytes object."""
        return b''.join(self.audio_chunks)
def _write_wav(audio_data: bytes) -> str:
    """Write raw 24 kHz mono 16-bit PCM bytes to a temp WAV file; return its path."""
    # Create the file first so we control its lifetime (delete=False), then
    # let the wave module write the proper RIFF header and frames.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        output_path = tmp_file.name
    with wave.open(output_path, 'wb') as wav_file:
        wav_file.setnchannels(1)      # Mono
        wav_file.setsampwidth(2)      # 16-bit samples
        wav_file.setframerate(24000)  # 24 kHz
        wav_file.writeframes(audio_data)
    return output_path


def synthesize_speech(audio_file, text_input):
    """Clone a voice from a recording and synthesize the given text with it.

    Args:
        audio_file: Path to the recorded audio file (from the Gradio audio
            component), or None/"" if nothing was recorded.
        text_input: Text to synthesize.

    Returns:
        tuple: ``(wav_path_or_None, status_message)``. Never raises — all
        failures are reported through the status message.
    """
    try:
        # Validate inputs before doing any network work.
        if not audio_file:
            return None, "❌ Please record a voice sample first"
        if not text_input or text_input.strip() == "":
            return None, "❌ Please enter the text to synthesize"
        init_dashscope_api_key()
        # Step 1: enroll a voice clone from the recorded sample.
        print("🎀 Creating voice clone...")
        voice_id = create_voice(audio_file, audio_mime_type="audio/wav")
        # Step 2: stream synthesis over the realtime websocket API.
        print("πŸ”Š Synthesizing speech...")
        callback = TTSCallback()
        qwen_tts_realtime = QwenTtsRealtime(
            model=DEFAULT_TARGET_MODEL,
            callback=callback,
            url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
        )
        qwen_tts_realtime.connect()
        qwen_tts_realtime.update_session(
            voice=voice_id,
            response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
            mode='server_commit'
        )
        qwen_tts_realtime.append_text(text_input)
        qwen_tts_realtime.finish()
        # Block until the server signals session.finished (or an error).
        callback.wait_for_finished()
        if callback.error_msg:
            return None, f"❌ Synthesis failed: {callback.error_msg}"
        audio_data = callback.get_audio_data()
        if not audio_data:
            return None, "❌ No audio data generated"
        output_path = _write_wav(audio_data)
        success_msg = f"βœ… Synthesis successful! Session ID: {qwen_tts_realtime.get_session_id()}"
        print(success_msg)
        return output_path, success_msg
    except Exception as e:
        # Surface any failure to the UI instead of crashing the Gradio worker.
        error_msg = f"❌ An error occurred: {str(e)}"
        print(error_msg)
        return None, error_msg
# ======= Gradio Interface =======
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for voice cloning and synthesis."""
    with gr.Blocks(title="Qwen Voice Cloning and Synthesis", theme=gr.themes.Soft()) as demo:
        # Header with usage instructions.
        gr.Markdown("""
# πŸŽ™οΈ Qwen Voice Cloning and Synthesis
**Usage Steps:**
1. Click the microphone icon to record a voice sample (recommended 10-30 seconds, clear and natural)
2. Enter the text content to synthesize
3. Click the "Start Synthesis" button
4. Wait for synthesis to complete, then play or download the result
**Notes:**
- Please ensure the environment variable `DASHSCOPE_API_KEY` is set
- Better recording quality leads to better synthesis results
""")
        # Two-column layout: inputs on the left, results on the right.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Step 1: Record Voice Sample")
                # Microphone recording, saved to a WAV file path for create_voice().
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record Voice",
                    format="wav"
                )
                gr.Markdown("### Step 2: Enter Text to Synthesize")
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Please enter the text content to synthesize...",
                    lines=5,
                    value="Hello, this is a voice synthesized using voice cloning technology."
                )
                submit_btn = gr.Button("🎡 Start Synthesis", variant="primary", size="lg")
            with gr.Column(scale=1):
                gr.Markdown("### Synthesis Result")
                # Read-only status line fed by synthesize_speech's message output.
                status_output = gr.Textbox(
                    label="Status Information",
                    interactive=False,
                    lines=2
                )
                audio_output = gr.Audio(
                    label="Synthesized Voice",
                    type="filepath"
                )
        # Bind events: synthesize_speech returns (wav_path, status_message).
        submit_btn.click(
            fn=synthesize_speech,
            inputs=[audio_input, text_input],
            outputs=[audio_output, status_output]
        )
        gr.Markdown("""
---
πŸ’‘ **Tip:** For better results, please ensure a quiet recording environment and clear, natural pronunciation.
""")
    return demo
if __name__ == "__main__":
    # Verify the API key up front so misconfiguration shows in the logs
    # instead of failing on the first synthesis request.
    try:
        init_dashscope_api_key()
        print("βœ… API Key verified successfully")
    except (KeyError, ValueError) as e:
        # os.environ['API_KEY'] raises KeyError when the variable is unset,
        # so it must be caught alongside the explicit ValueError — otherwise
        # the warning below is unreachable and the app crashes on startup.
        print(f"⚠️ Warning: {e}")
        print("Please set the environment variable: export DASHSCOPE_API_KEY='your-api-key'")
    demo = create_gradio_interface()
    # Bind to all interfaces on the standard HF Spaces port.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False
    )