# Source: Hugging Face Space "Qwen/Qwen-TTS-Clone-Demo" (status: Running; file size: 10,187 bytes)

import gradio as gr
import os
import requests
import base64
import pathlib
import threading
import tempfile
from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat
import dashscope
import wave
import numpy as np

# ======= Constants Configuration =======
# Model identifier used both as the enrollment "target_model" in create_voice()
# and as the `model` passed to QwenTtsRealtime in synthesize_speech().
DEFAULT_TARGET_MODEL = "qwen3-tts-vc-realtime-2025-11-27"
# Default "preferred_name" sent in the voice-enrollment request payload.
DEFAULT_PREFERRED_NAME = "custom_voice"
# Default MIME type embedded in the base64 data URI of the uploaded sample.
DEFAULT_AUDIO_MIME_TYPE = "audio/wav"

def init_dashscope_api_key():
    """Load the DashScope API key from the environment and hand it to the SDK.

    The key is read from ``API_KEY`` first (the variable this function has
    always used) and from ``DASHSCOPE_API_KEY`` as a fallback, so either
    spelling works.

    Returns:
        The API key string that was assigned to ``dashscope.api_key``.

    Raises:
        ValueError: If neither variable is set, or the value is empty.
    """
    # Use .get() rather than os.environ[...]: a missing variable must surface
    # as the ValueError below, not as a bare KeyError that skips the check.
    api_key = os.environ.get('API_KEY') or os.environ.get('DASHSCOPE_API_KEY')
    if not api_key:
        raise ValueError(
            "Please set the environment variable API_KEY (or DASHSCOPE_API_KEY)"
        )
    dashscope.api_key = api_key
    return api_key

def create_voice(file_path: str,
                 target_model: str = DEFAULT_TARGET_MODEL,
                 preferred_name: str = DEFAULT_PREFERRED_NAME,
                 audio_mime_type: str = DEFAULT_AUDIO_MIME_TYPE) -> str:
    """Enroll an audio sample as a custom voice and return its voice identifier.

    The file is read, base64-encoded into a ``data:`` URI and POSTed to the
    DashScope voice-customization endpoint. The value found at
    ``["output"]["voice"]`` in the JSON response is returned.

    Args:
        file_path: Path of the audio sample on disk.
        target_model: Sent as ``input.target_model`` in the request.
        preferred_name: Sent as ``input.preferred_name`` in the request.
        audio_mime_type: MIME type written into the data URI.

    Returns:
        The ``voice`` value from the service response.

    Raises:
        ValueError: If no API key is configured in the environment.
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: On a non-200 response, SSL failure, timeout, exhausted
            retries, or a response body that cannot be parsed.
    """
    # Read the key with .get() so a missing variable produces an actionable
    # message instead of an opaque KeyError('API_KEY').
    api_key = os.environ.get('API_KEY') or os.environ.get('DASHSCOPE_API_KEY')
    if not api_key:
        raise ValueError(
            "Please set the environment variable API_KEY (or DASHSCOPE_API_KEY)"
        )

    file_path_obj = pathlib.Path(file_path)
    if not file_path_obj.exists():
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    base64_str = base64.b64encode(file_path_obj.read_bytes()).decode()
    data_uri = f"data:{audio_mime_type};base64,{base64_str}"

    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    payload = {
        "model": "qwen-voice-enrollment",
        "input": {
            "action": "create",
            "target_model": target_model,
            "preferred_name": preferred_name,
            "audio": {"data": data_uri},
        },
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Imported locally, as in the original, so the module-level import block
    # does not need to change.
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # Retry transient failures. POST must be listed explicitly because it is
    # not retried by default.
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["POST"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # The context manager closes the session on every exit path.
    with requests.Session() as session:
        # If SSL errors persist, temporarily change to False (for testing only)
        session.verify = True  # Enable SSL verification
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            resp = session.post(url, json=payload, headers=headers, timeout=60)
            if resp.status_code != 200:
                raise RuntimeError(f"Failed to create voice: {resp.status_code}, {resp.text}")

            return resp.json()["output"]["voice"]
        except requests.exceptions.SSLError as e:
            raise RuntimeError(
                f"SSL connection error: {e}. Please check network environment or try using a proxy"
            ) from e
        except requests.exceptions.Timeout as e:
            raise RuntimeError(f"Request timeout: {e}") from e
        except requests.exceptions.RetryError as e:
            # Raised once every retry on a status_forcelist code is used up.
            raise RuntimeError(f"Failed to create voice after retries: {e}") from e
        except (KeyError, TypeError, ValueError) as e:
            # KeyError/TypeError: unexpected JSON shape (e.g. "output" missing
            # or null); ValueError: body is not valid JSON.
            raise RuntimeError(f"Failed to parse voice response: {e}") from e

class TTSCallback(QwenTtsRealtimeCallback):
    """Realtime TTS callback that accumulates streamed audio chunks.

    Audio deltas are base64-decoded and appended to ``audio_chunks``.
    ``complete_event`` is set when the session finishes, when an event cannot
    be processed, when the server reports an error, or when the connection
    closes early, so a waiting caller is always released.
    """

    def __init__(self):
        super().__init__()
        # Set once no more audio will arrive (success or failure).
        self.complete_event = threading.Event()
        # Decoded audio payloads in arrival order.
        self.audio_chunks = []
        # Human-readable failure description, or None when nothing went wrong.
        self.error_msg = None

    def on_open(self) -> None:
        """Log that the connection was opened."""
        print('[TTS] Connection established')

    def on_close(self, close_status_code, close_msg) -> None:
        """Log the close and release any waiter if the session never finished."""
        print(f'[TTS] Connection closed code={close_status_code}, msg={close_msg}')
        # Without this, a connection dropped before 'session.finished' would
        # leave wait_for_finished() blocked forever.
        if not self.complete_event.is_set():
            if self.error_msg is None:
                self.error_msg = (
                    'Connection closed before the session finished '
                    f'(code={close_status_code}, msg={close_msg})'
                )
            self.complete_event.set()

    def on_event(self, response: dict) -> None:
        """Dispatch one server event: collect audio, track completion and errors."""
        try:
            event_type = response.get('type', '')
            if event_type == 'session.created':
                print(f'[TTS] Session started: {response["session"]["id"]}')
            elif event_type == 'response.audio.delta':
                audio_data = base64.b64decode(response['delta'])
                self.audio_chunks.append(audio_data)
            elif event_type == 'response.done':
                print('[TTS] Response completed')
            elif event_type == 'session.finished':
                print('[TTS] Session finished')
                self.complete_event.set()
            elif event_type == 'error':
                # NOTE(review): assumes failures arrive as
                # {"type": "error", "error": {...}} -- confirm against the
                # DashScope realtime server-event reference.
                self.error_msg = str(response.get('error') or response)
                print(f'[Error] Server reported an error: {self.error_msg}')
                self.complete_event.set()
        except Exception as e:
            # A malformed event must not leave the waiter blocked.
            self.error_msg = str(e)
            print(f'[Error] Exception while processing callback event: {e}')
            self.complete_event.set()

    def wait_for_finished(self, timeout=None):
        """Block until the session completes or fails.

        Args:
            timeout: Optional maximum number of seconds to wait; ``None``
                (the default) waits indefinitely, as before.

        Returns:
            True if completion was signalled, False if the timeout elapsed.
        """
        return self.complete_event.wait(timeout)

    def get_audio_data(self):
        """Return the synthesized audio data"""
        return b''.join(self.audio_chunks)

def _write_pcm_as_wav(pcm_data, sample_rate=24000, channels=1, sample_width=2):
    """Write raw PCM bytes to a new temporary ``.wav`` file and return its path."""
    # Reserve a unique path and close that handle before reopening by name:
    # opening the same file twice at once fails on Windows.
    # delete=False because the caller hands the path to Gradio for playback.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        output_path = tmp_file.name

    with wave.open(output_path, 'wb') as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_data)
    return output_path


def synthesize_speech(audio_file, text_input):
    """
    Clone the voice in ``audio_file`` and speak ``text_input`` with it.

    Args:
        audio_file: Path to the recorded audio file (from Gradio audio component)
        text_input: Text to synthesize

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of the generated WAV file, or ``None`` when validation or synthesis
        failed; ``status_message`` describes the outcome. Exceptions are
        caught and reported through the message rather than raised.
    """
    qwen_tts_realtime = None
    try:
        if not audio_file:
            return None, "❌ Please record a voice sample first"

        if not text_input or not text_input.strip():
            return None, "❌ Please enter the text to synthesize"

        # Initialize API Key
        init_dashscope_api_key()

        # Create voice clone
        print("🎤 Creating voice clone...")
        voice_id = create_voice(audio_file, audio_mime_type="audio/wav")

        # Initialize TTS
        print("🔊 Synthesizing speech...")
        callback = TTSCallback()
        qwen_tts_realtime = QwenTtsRealtime(
            model=DEFAULT_TARGET_MODEL,
            callback=callback,
            url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
        )
        qwen_tts_realtime.connect()

        # Update session configuration
        qwen_tts_realtime.update_session(
            voice=voice_id,
            response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
            mode='server_commit'
        )

        # Send text, then signal that no more input follows
        qwen_tts_realtime.append_text(text_input)
        qwen_tts_realtime.finish()

        # Wait for completion
        callback.wait_for_finished()

        if callback.error_msg:
            return None, f"❌ Synthesis failed: {callback.error_msg}"

        audio_data = callback.get_audio_data()
        if not audio_data:
            return None, "❌ No audio data generated"

        # WAV parameters mirror the requested PCM_24000HZ_MONO_16BIT format:
        # 24 kHz, 1 channel, 2 bytes per sample.
        output_path = _write_pcm_as_wav(
            audio_data, sample_rate=24000, channels=1, sample_width=2
        )

        success_msg = f"✅ Synthesis successful! Session ID: {qwen_tts_realtime.get_session_id()}"
        print(success_msg)
        return output_path, success_msg

    except Exception as e:
        # Top-level UI boundary: report the failure instead of raising it.
        error_msg = f"❌ An error occurred: {str(e)}"
        print(error_msg)
        return None, error_msg
    finally:
        # Release the realtime connection on every path. Best-effort: a
        # failure to close must not replace the result already computed.
        if qwen_tts_realtime is not None:
            try:
                qwen_tts_realtime.close()
            except Exception as close_err:
                print(f'[TTS] Failed to close connection: {close_err}')

# ======= Gradio Interface =======
def create_gradio_interface():
    """Build the Gradio Blocks UI for recording a sample and synthesizing text.

    The layout has two columns: inputs (microphone recording, text box,
    submit button) on the left and outputs (status text, synthesized audio)
    on the right. The button is wired to ``synthesize_speech``.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """

    with gr.Blocks(title="Qwen Voice Cloning and Synthesis", theme=gr.themes.Soft()) as demo:
        # The note names API_KEY because that is the environment variable the
        # application reads.
        gr.Markdown("""
        # 🎙️ Qwen Voice Cloning and Synthesis
        
        **Usage Steps:**
        1. Click the microphone icon to record a voice sample (recommended 10-30 seconds, clear and natural)
        2. Enter the text content to synthesize
        3. Click the "Start Synthesis" button
        4. Wait for synthesis to complete, then play or download the result
        
        **Notes:**
        - Please ensure the environment variable `API_KEY` is set
        - Better recording quality leads to better synthesis results
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Step 1: Record Voice Sample")
                # type="filepath" makes the handler receive a path on disk.
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record Voice",
                    format="wav"
                )

                gr.Markdown("### Step 2: Enter Text to Synthesize")
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Please enter the text content to synthesize...",
                    lines=5,
                    value="Hello, this is a voice synthesized using voice cloning technology."
                )

                submit_btn = gr.Button("🎵 Start Synthesis", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### Synthesis Result")
                status_output = gr.Textbox(
                    label="Status Information",
                    interactive=False,
                    lines=2
                )
                audio_output = gr.Audio(
                    label="Synthesized Voice",
                    type="filepath"
                )

        # Bind events: output order matches synthesize_speech's
        # (audio_path, status_message) return tuple.
        submit_btn.click(
            fn=synthesize_speech,
            inputs=[audio_input, text_input],
            outputs=[audio_output, status_output]
        )

        gr.Markdown("""
        ---
        💡 **Tip:** For better results, please ensure a quiet recording environment and clear, natural pronunciation.
        """)

    return demo

if __name__ == "__main__":
    # Check up front that an API key is configured. This only confirms the
    # key is present; it is not validated against the service. A missing key
    # is reported as a warning so the UI can still start.
    try:
        init_dashscope_api_key()
        print("✅ API Key verified successfully")
    except (KeyError, ValueError) as e:
        # KeyError is caught as well: a missing environment variable can
        # surface as either exception type, and neither should abort startup.
        if isinstance(e, KeyError):
            print(f"⚠️  Warning: environment variable {e} is not set")
        else:
            print(f"⚠️  Warning: {e}")
        print("Please set the environment variable: export API_KEY='your-api-key'")

    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False
    )