import gradio as gr
import os
import requests
import base64
import pathlib
import threading
import tempfile
import wave

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import dashscope
from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat

# ======= Constants Configuration =======
DEFAULT_TARGET_MODEL = "qwen3-tts-vc-realtime-2025-11-27"
DEFAULT_PREFERRED_NAME = "custom_voice"
DEFAULT_AUDIO_MIME_TYPE = "audio/wav"


def init_dashscope_api_key():
    """Initialize the API key for the dashscope SDK."""
    # Prefer the documented DASHSCOPE_API_KEY; fall back to API_KEY, which some
    # deployments use as the secret name. Using .get() keeps the check below
    # meaningful instead of raising KeyError first.
    api_key = os.environ.get('DASHSCOPE_API_KEY') or os.environ.get('API_KEY')
    if not api_key:
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")
    dashscope.api_key = api_key
    return api_key


def create_voice(file_path: str,
                 target_model: str = DEFAULT_TARGET_MODEL,
                 preferred_name: str = DEFAULT_PREFERRED_NAME,
                 audio_mime_type: str = DEFAULT_AUDIO_MIME_TYPE) -> str:
    """Enroll a voice sample and return the voice ID."""
    api_key = init_dashscope_api_key()
    file_path_obj = pathlib.Path(file_path)
    if not file_path_obj.exists():
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    base64_str = base64.b64encode(file_path_obj.read_bytes()).decode()
    data_uri = f"data:{audio_mime_type};base64,{base64_str}"
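    # The enrollment endpoint takes the sample inline as a data URI, e.g.
    # "data:audio/wav;base64,UklGR..." ("UklGR" is the base64 encoding of a WAV
    # file's leading "RIFF" bytes).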
    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    payload = {
        "model": "qwen-voice-enrollment",
        "input": {
            "action": "create",
            "target_model": target_model,
            "preferred_name": preferred_name,
            "audio": {"data": data_uri}
        }
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    # Create a session with retries and SSL verification enabled
    session = requests.Session()
    # If SSL errors persist, set this to False temporarily (for testing only)
    session.verify = True
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        # urllib3 does not retry POST by default (it is not idempotent),
        # so it must be allowed explicitly here
        allowed_methods=["POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    try:
        resp = session.post(url, json=payload, headers=headers, timeout=60)
        if resp.status_code != 200:
            raise RuntimeError(f"Failed to create voice: {resp.status_code}, {resp.text}")
        return resp.json()["output"]["voice"]
    except requests.exceptions.SSLError as e:
        raise RuntimeError(f"SSL connection error: {e}. Please check the network environment or try using a proxy")
    except requests.exceptions.Timeout as e:
        raise RuntimeError(f"Request timeout: {e}")
    except (KeyError, ValueError) as e:
        raise RuntimeError(f"Failed to parse voice response: {e}")
    finally:
        session.close()
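

# A minimal usage sketch for the enrollment call above (assuming a local
# recording named 'sample.wav'; the file name is illustrative):
#
#     voice_id = create_voice("sample.wav", audio_mime_type="audio/wav")
#     print(voice_id)  # the voice ID string returned by the enrollment service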


class TTSCallback(QwenTtsRealtimeCallback):
    """TTS streaming callback that collects audio data."""

    def __init__(self):
        self.complete_event = threading.Event()
        self.audio_chunks = []
        self.error_msg = None

    def on_open(self) -> None:
        print('[TTS] Connection established')

    def on_close(self, close_status_code, close_msg) -> None:
        print(f'[TTS] Connection closed code={close_status_code}, msg={close_msg}')

    def on_event(self, response: dict) -> None:
        try:
            event_type = response.get('type', '')
            if event_type == 'session.created':
                print(f'[TTS] Session started: {response["session"]["id"]}')
            elif event_type == 'response.audio.delta':
                audio_data = base64.b64decode(response['delta'])
                self.audio_chunks.append(audio_data)
            elif event_type == 'response.done':
                print('[TTS] Response completed')
            elif event_type == 'session.finished':
                print('[TTS] Session finished')
                self.complete_event.set()
        except Exception as e:
            self.error_msg = str(e)
            print(f'[Error] Exception while processing callback event: {e}')
            self.complete_event.set()

    def wait_for_finished(self):
        self.complete_event.wait()

    def get_audio_data(self):
        """Return the synthesized audio data as a single bytes object."""
        return b''.join(self.audio_chunks)
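

# The chunks collected above are raw PCM (24 kHz, mono, 16-bit), so playback
# duration can be estimated from the byte count alone. A small sketch, assuming
# a callback that has finished collecting audio:
#
#     pcm = callback.get_audio_data()
#     duration_sec = len(pcm) / (24000 * 2)  # 2 bytes per 16-bit mono sample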


def synthesize_speech(audio_file, text_input):
    """
    Main entry point for speech synthesis.

    Args:
        audio_file: Path to the recorded audio file (from the Gradio audio component)
        text_input: Text to synthesize

    Returns:
        Tuple of (path to the synthesized audio file or None, status message)
    """
    try:
        if not audio_file:
            return None, "❌ Please record a voice sample first"
        if not text_input or text_input.strip() == "":
            return None, "❌ Please enter the text to synthesize"

        # Initialize API key
        init_dashscope_api_key()

        # Create the voice clone
        status_msg = "🎤 Creating voice clone..."
        print(status_msg)
        voice_id = create_voice(audio_file, audio_mime_type="audio/wav")

        # Initialize TTS
        status_msg = "🔊 Synthesizing speech..."
        print(status_msg)
        callback = TTSCallback()
        qwen_tts_realtime = QwenTtsRealtime(
            model=DEFAULT_TARGET_MODEL,
            callback=callback,
            url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
        )
        qwen_tts_realtime.connect()

        # Update session configuration
        qwen_tts_realtime.update_session(
            voice=voice_id,
            response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
            mode='server_commit'
        )
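        # 'server_commit' lets the server decide when to commit buffered text for
        # synthesis (per the DashScope realtime TTS docs), so no explicit commit
        # call is needed between append_text() and finish().
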
        # Send text
        qwen_tts_realtime.append_text(text_input)
        qwen_tts_realtime.finish()

        # Wait for completion
        callback.wait_for_finished()
        if callback.error_msg:
            return None, f"❌ Synthesis failed: {callback.error_msg}"

        # Get the audio data and save it as a WAV file
        audio_data = callback.get_audio_data()
        if not audio_data:
            return None, "❌ No audio data generated"

        # Create a temporary file to hold the audio; delete=False keeps the file
        # on disk so Gradio can serve it after this function returns
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            output_path = tmp_file.name

        # Write the WAV file
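        # These parameters must match AudioFormat.PCM_24000HZ_MONO_16BIT requested
        # in update_session(); a mismatch plays back at the wrong speed or pitch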
        with wave.open(output_path, 'wb') as wav_file:
            wav_file.setnchannels(1)      # mono
            wav_file.setsampwidth(2)      # 16-bit
            wav_file.setframerate(24000)  # 24 kHz
            wav_file.writeframes(audio_data)

        success_msg = f"✅ Synthesis successful! Session ID: {qwen_tts_realtime.get_session_id()}"
        print(success_msg)
        return output_path, success_msg
    except Exception as e:
        error_msg = f"❌ An error occurred: {str(e)}"
        print(error_msg)
        return None, error_msg


# ======= Gradio Interface =======
def create_gradio_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Qwen Voice Cloning and Synthesis", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🎙️ Qwen Voice Cloning and Synthesis

        **Usage Steps:**
        1. Click the microphone icon to record a voice sample (10-30 seconds recommended, clear and natural)
        2. Enter the text content to synthesize
        3. Click the "Start Synthesis" button
        4. Wait for synthesis to complete, then play or download the result

        **Notes:**
        - Please ensure the environment variable `DASHSCOPE_API_KEY` is set
        - Better recording quality leads to better synthesis results
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Step 1: Record Voice Sample")
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record Voice",
                    format="wav"
                )
                gr.Markdown("### Step 2: Enter Text to Synthesize")
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Please enter the text content to synthesize...",
                    lines=5,
                    value="Hello, this is a voice synthesized using voice cloning technology."
                )
                submit_btn = gr.Button("🎵 Start Synthesis", variant="primary", size="lg")
            with gr.Column(scale=1):
                gr.Markdown("### Synthesis Result")
                status_output = gr.Textbox(
                    label="Status Information",
                    interactive=False,
                    lines=2
                )
                audio_output = gr.Audio(
                    label="Synthesized Voice",
                    type="filepath"
                )
        # Bind events
        submit_btn.click(
            fn=synthesize_speech,
            inputs=[audio_input, text_input],
            outputs=[audio_output, status_output]
        )
        gr.Markdown("""
        ---
        💡 **Tip:** For better results, please ensure a quiet recording environment and clear, natural pronunciation.
        """)
    return demo
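

# Gradio handles each click synchronously by default; if synthesis regularly
# takes more than a few seconds, enabling the request queue before launch is a
# common option (a sketch using the standard Gradio API):
#
#     demo = create_gradio_interface()
#     demo.queue().launch(server_name="0.0.0.0", server_port=7860)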


if __name__ == "__main__":
    # Check the API key up front
    try:
        init_dashscope_api_key()
        print("✅ API Key verified successfully")
    except ValueError as e:
        print(f"⚠️ Warning: {e}")
        print("Please set the environment variable: export DASHSCOPE_API_KEY='your-api-key'")

    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False
    )