Spaces:
Running
Running
File size: 10,187 Bytes
04fb7a1 6f10747 04fb7a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 |
import gradio as gr
import os
import requests
import base64
import pathlib
import threading
import tempfile
from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat
import dashscope
import wave
import numpy as np
# ======= Constants Configuration =======
# Realtime TTS model that enrolled voices are created for and synthesized with.
DEFAULT_TARGET_MODEL = "qwen3-tts-vc-realtime-2025-11-27"
# Name prefix requested for newly enrolled custom voices.
DEFAULT_PREFERRED_NAME = "custom_voice"
# MIME type sent with the uploaded voice sample (recordings are saved as WAV).
DEFAULT_AUDIO_MIME_TYPE = "audio/wav"
def init_dashscope_api_key():
    """Initialize the API key for the dashscope SDK.

    Reads the key from the ``API_KEY`` environment variable, stores it on the
    dashscope module, and returns it.

    Returns:
        str: The API key.

    Raises:
        ValueError: If the environment variable is missing or empty.
    """
    # Use .get() rather than indexing: os.environ['API_KEY'] raised KeyError
    # for a missing variable, which made the empty/missing guard below
    # unreachable and produced an opaque error instead of the intended one.
    api_key = os.environ.get('API_KEY')
    if not api_key:
        # NOTE(review): the code reads API_KEY but this message (and the
        # README-style docs below) says DASHSCOPE_API_KEY — confirm which
        # variable deployments actually set.
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")
    dashscope.api_key = api_key
    return api_key
def create_voice(file_path: str,
                 target_model: str = DEFAULT_TARGET_MODEL,
                 preferred_name: str = DEFAULT_PREFERRED_NAME,
                 audio_mime_type: str = DEFAULT_AUDIO_MIME_TYPE) -> str:
    """Enroll a custom voice from a local audio sample.

    Uploads the audio (as a base64 data URI) to the DashScope voice
    enrollment endpoint and returns the generated voice ID, which can then
    be used in a realtime TTS session.

    Args:
        file_path: Path to the local audio sample file.
        target_model: TTS model the voice is enrolled for.
        preferred_name: Human-readable name prefix for the new voice.
        audio_mime_type: MIME type of the audio sample.

    Returns:
        str: The voice ID from the service response.

    Raises:
        ValueError: If the API key environment variable is unset or empty.
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: On HTTP failure, SSL error, timeout, or an
            unparsable response body.
    """
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # .get() keeps the missing-key case a clear ValueError instead of the
    # opaque KeyError that os.environ['API_KEY'] would raise.
    api_key = os.environ.get('API_KEY')
    if not api_key:
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")

    file_path_obj = pathlib.Path(file_path)
    if not file_path_obj.exists():
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    # Inline the audio as a data URI, as required by the enrollment API.
    base64_str = base64.b64encode(file_path_obj.read_bytes()).decode()
    data_uri = f"data:{audio_mime_type};base64,{base64_str}"

    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    payload = {
        "model": "qwen-voice-enrollment",
        "input": {
            "action": "create",
            "target_model": target_model,
            "preferred_name": preferred_name,
            "audio": {"data": data_uri}
        }
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Retry transient server errors and rate limits. POST is not retried by
    # urllib3 by default, so it must be listed explicitly.
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    session = requests.Session()
    # Keep SSL verification enabled; flip to False only for local debugging.
    session.verify = True
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    try:
        resp = session.post(url, json=payload, headers=headers, timeout=60)
        if resp.status_code != 200:
            raise RuntimeError(f"Failed to create voice: {resp.status_code}, {resp.text}")
        return resp.json()["output"]["voice"]
    except requests.exceptions.SSLError as e:
        # Chain the cause so the original traceback is preserved for debugging.
        raise RuntimeError(f"SSL connection error: {e}. Please check network environment or try using a proxy") from e
    except requests.exceptions.Timeout as e:
        raise RuntimeError(f"Request timeout: {e}") from e
    except (KeyError, ValueError) as e:
        raise RuntimeError(f"Failed to parse voice response: {e}") from e
    finally:
        # Always release the pooled connections, even on failure.
        session.close()
class TTSCallback(QwenTtsRealtimeCallback):
    """TTS streaming callback that accumulates synthesized audio chunks.

    The websocket client invokes these hooks from its own thread; the main
    thread blocks on :meth:`wait_for_finished` until the session completes.
    """

    def __init__(self):
        # Set when the session finishes normally or a callback error occurs.
        self.complete_event = threading.Event()
        # Raw decoded audio chunks, in arrival order.
        self.audio_chunks = []
        # Error description if event processing failed; None on success.
        self.error_msg = None

    def on_open(self) -> None:
        print('[TTS] Connection established')

    def on_close(self, close_status_code, close_msg) -> None:
        print(f'[TTS] Connection closed code={close_status_code}, msg={close_msg}')

    def on_event(self, response: dict) -> None:
        """Handle one server event: buffer audio deltas, track completion."""
        try:
            event_type = response.get('type', '')
            if event_type == 'session.created':
                print(f'[TTS] Session started: {response["session"]["id"]}')
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode before buffering.
                audio_data = base64.b64decode(response['delta'])
                self.audio_chunks.append(audio_data)
            elif event_type == 'response.done':
                print('[TTS] Response completed')
            elif event_type == 'session.finished':
                print('[TTS] Session finished')
                self.complete_event.set()
        except Exception as e:
            # Record the failure and unblock any waiter so callers don't hang.
            self.error_msg = str(e)
            print(f'[Error] Exception while processing callback event: {e}')
            self.complete_event.set()

    def wait_for_finished(self, timeout=None):
        """Block until the session finishes (or an error sets the event).

        Args:
            timeout: Optional maximum seconds to wait. The default ``None``
                waits indefinitely (the original behavior); passing a value
                guards against hanging forever if the connection drops
                without a ``session.finished`` event.

        Returns:
            bool: True if the event was set, False if the wait timed out.
        """
        return self.complete_event.wait(timeout)

    def get_audio_data(self):
        """Return all collected audio chunks concatenated as raw bytes."""
        return b''.join(self.audio_chunks)
def synthesize_speech(audio_file, text_input):
    """
    Main function for speech synthesis.

    Clones a voice from the recorded sample, streams TTS for the given
    text over the realtime websocket API, and writes the resulting PCM to
    a temporary WAV file.

    Args:
        audio_file: Path to the recorded audio file (from Gradio audio component)
        text_input: Text to synthesize

    Returns:
        tuple: (path to the synthesized WAV file, or None on failure;
                a human-readable status message)
    """
    try:
        # Both the voice sample and the text are required inputs.
        if not audio_file:
            return None, "β Please record a voice sample first"
        if not text_input or text_input.strip() == "":
            return None, "β Please enter the text to synthesize"

        # Initialize API Key
        init_dashscope_api_key()

        # Create voice clone from the recorded sample.
        print("π€ Creating voice clone...")
        voice_id = create_voice(audio_file, audio_mime_type="audio/wav")

        # Initialize realtime TTS over websocket.
        print("π Synthesizing speech...")
        callback = TTSCallback()
        qwen_tts_realtime = QwenTtsRealtime(
            model=DEFAULT_TARGET_MODEL,
            callback=callback,
            url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
        )
        qwen_tts_realtime.connect()

        # Configure the session with the freshly enrolled voice.
        qwen_tts_realtime.update_session(
            voice=voice_id,
            response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
            mode='server_commit'
        )

        # Send the text, signal end-of-input, then wait for all audio.
        qwen_tts_realtime.append_text(text_input)
        qwen_tts_realtime.finish()
        callback.wait_for_finished()

        if callback.error_msg:
            return None, f"β Synthesis failed: {callback.error_msg}"

        audio_data = callback.get_audio_data()
        if not audio_data:
            return None, "β No audio data generated"

        # Persist the raw PCM as WAV. delete=False so the file survives for
        # Gradio to serve after this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            output_path = tmp_file.name
        # WAV parameters must match the requested response_format above.
        with wave.open(output_path, 'wb') as wav_file:
            wav_file.setnchannels(1)       # mono
            wav_file.setsampwidth(2)       # 16-bit samples
            wav_file.setframerate(24000)   # 24 kHz
            wav_file.writeframes(audio_data)

        # NOTE(review): this f-string was physically split across two lines
        # in the scraped source (a syntax error); rejoined into one line.
        success_msg = f"β Synthesis successful! Session ID: {qwen_tts_realtime.get_session_id()}"
        print(success_msg)
        return output_path, success_msg
    except Exception as e:
        # Top-level UI boundary: surface any failure as a status message
        # instead of crashing the Gradio handler.
        error_msg = f"β An error occurred: {str(e)}"
        print(error_msg)
        return None, error_msg
# ======= Gradio Interface =======
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for recording and synthesis."""
    with gr.Blocks(title="Qwen Voice Cloning and Synthesis", theme=gr.themes.Soft()) as demo:
        # Header: usage instructions shown above the controls.
        gr.Markdown("""
# ποΈ Qwen Voice Cloning and Synthesis
**Usage Steps:**
1. Click the microphone icon to record a voice sample (recommended 10-30 seconds, clear and natural)
2. Enter the text content to synthesize
3. Click the "Start Synthesis" button
4. Wait for synthesis to complete, then play or download the result
**Notes:**
- Please ensure the environment variable `DASHSCOPE_API_KEY` is set
- Better recording quality leads to better synthesis results
""")

        with gr.Row():
            # Left column: inputs (recording + text + submit).
            with gr.Column(scale=1):
                gr.Markdown("### Step 1: Record Voice Sample")
                voice_sample = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record Voice",
                    format="wav"
                )
                gr.Markdown("### Step 2: Enter Text to Synthesize")
                tts_text = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Please enter the text content to synthesize...",
                    lines=5,
                    value="Hello, this is a voice synthesized using voice cloning technology."
                )
                synth_button = gr.Button("π΅ Start Synthesis", variant="primary", size="lg")

            # Right column: outputs (status text + playable result).
            with gr.Column(scale=1):
                gr.Markdown("### Synthesis Result")
                status_box = gr.Textbox(
                    label="Status Information",
                    interactive=False,
                    lines=2
                )
                result_audio = gr.Audio(
                    label="Synthesized Voice",
                    type="filepath"
                )

        # Wire the button to the synthesis pipeline.
        synth_button.click(
            fn=synthesize_speech,
            inputs=[voice_sample, tts_text],
            outputs=[result_audio, status_box]
        )

        # Footer tip.
        gr.Markdown("""
---
π‘ **Tip:** For better results, please ensure a quiet recording environment and clear, natural pronunciation.
""")
    return demo
if __name__ == "__main__":
    # Check API Key up front so misconfiguration is visible before the UI
    # starts; the app still launches either way so the warning is readable.
    try:
        init_dashscope_api_key()
        # NOTE(review): this print string was split across two lines in the
        # scraped source (a syntax error); rejoined into one line.
        print("β API Key verified successfully")
    except (KeyError, ValueError) as e:
        # KeyError is included because os.environ['API_KEY'] raises it when
        # the variable is missing; the original only caught ValueError and
        # would have crashed here instead of printing the hint.
        print(f"β οΈ Warning: {e}")
        print("Please set the environment variable: export DASHSCOPE_API_KEY='your-api-key'")
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False
    )