# littlebird13's picture
# Update app.py
# 6f10747 verified
import gradio as gr
import os
import requests
import base64
import pathlib
import threading
import tempfile
from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat
import dashscope
import wave
import numpy as np
# ======= Constants Configuration =======
# TTS model used both as the voice-enrollment target and for realtime synthesis.
DEFAULT_TARGET_MODEL = "qwen3-tts-vc-realtime-2025-11-27"
# Display name requested for the cloned voice at enrollment time.
DEFAULT_PREFERRED_NAME = "custom_voice"
# MIME type embedded in the base64 data URI sent to the enrollment API.
DEFAULT_AUDIO_MIME_TYPE = "audio/wav"
def init_dashscope_api_key():
    """Initialize the dashscope SDK with the API key from the environment.

    Reads the ``API_KEY`` environment variable, assigns it to
    ``dashscope.api_key``, and returns it.

    Returns:
        str: The API key.

    Raises:
        ValueError: If the environment variable is missing or empty.
    """
    # Use .get() instead of os.environ[...] so a missing variable reaches the
    # explicit ValueError below rather than escaping as a bare KeyError.
    api_key = os.environ.get('API_KEY')
    if not api_key:
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")
    dashscope.api_key = api_key
    return api_key
def create_voice(file_path: str,
                 target_model: str = DEFAULT_TARGET_MODEL,
                 preferred_name: str = DEFAULT_PREFERRED_NAME,
                 audio_mime_type: str = DEFAULT_AUDIO_MIME_TYPE) -> str:
    """Enroll a custom voice from an audio sample via the DashScope API.

    Args:
        file_path: Path to the reference audio file.
        target_model: TTS model the voice is enrolled for.
        preferred_name: Human-readable name requested for the voice.
        audio_mime_type: MIME type embedded in the base64 data URI.

    Returns:
        str: The voice ID assigned by the enrollment service.

    Raises:
        ValueError: If the API_KEY environment variable is missing or empty.
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: On HTTP errors, SSL/timeout failures, or an
            unexpected response payload.
    """
    # Local imports keep the retry helpers out of the module namespace.
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # .get() avoids an uninformative KeyError when the variable is unset.
    api_key = os.environ.get('API_KEY')
    if not api_key:
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")

    file_path_obj = pathlib.Path(file_path)
    if not file_path_obj.exists():
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    # The API expects the audio sample inline as a base64 data URI.
    base64_str = base64.b64encode(file_path_obj.read_bytes()).decode()
    data_uri = f"data:{audio_mime_type};base64,{base64_str}"
    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    payload = {
        "model": "qwen-voice-enrollment",
        "input": {
            "action": "create",
            "target_model": target_model,
            "preferred_name": preferred_name,
            "audio": {"data": data_uri}
        }
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Session with SSL verification enabled and automatic retries on rate
    # limiting / transient server errors.
    session = requests.Session()
    session.verify = True  # Enable SSL verification
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    try:
        resp = session.post(url, json=payload, headers=headers, timeout=60)
        if resp.status_code != 200:
            raise RuntimeError(f"Failed to create voice: {resp.status_code}, {resp.text}")
        return resp.json()["output"]["voice"]
    except requests.exceptions.SSLError as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"SSL connection error: {e}. Please check network environment or try using a proxy") from e
    except requests.exceptions.Timeout as e:
        raise RuntimeError(f"Request timeout: {e}") from e
    except (KeyError, ValueError) as e:
        raise RuntimeError(f"Failed to parse voice response: {e}") from e
    finally:
        session.close()
class TTSCallback(QwenTtsRealtimeCallback):
    """TTS streaming callback that accumulates synthesized audio chunks.

    Collects base64-decoded PCM chunks from ``response.audio.delta`` events
    and signals completion (or failure) via a threading.Event.
    """

    def __init__(self):
        # Set when the session finishes or a callback error occurs.
        self.complete_event = threading.Event()
        # Raw PCM chunks in arrival order.
        self.audio_chunks = []
        # Message of the first callback error, or None on success.
        self.error_msg = None

    def on_open(self) -> None:
        print('[TTS] Connection established')

    def on_close(self, close_status_code, close_msg) -> None:
        print(f'[TTS] Connection closed code={close_status_code}, msg={close_msg}')

    def on_event(self, response: dict) -> None:
        """Dispatch one server event; collect audio and track completion."""
        try:
            event_type = response.get('type', '')
            if event_type == 'session.created':
                print(f'[TTS] Session started: {response["session"]["id"]}')
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode to raw bytes.
                audio_data = base64.b64decode(response['delta'])
                self.audio_chunks.append(audio_data)
            elif event_type == 'response.done':
                print('[TTS] Response completed')
            elif event_type == 'session.finished':
                print('[TTS] Session finished')
                self.complete_event.set()
        except Exception as e:
            # Record the failure and unblock any waiter so callers don't hang.
            self.error_msg = str(e)
            print(f'[Error] Exception while processing callback event: {e}')
            self.complete_event.set()

    def wait_for_finished(self, timeout=None):
        """Block until the session finishes (or fails).

        Args:
            timeout: Optional seconds to wait; None (the default, matching
                the previous behavior) waits indefinitely.

        Returns:
            bool: True if the completion event was set, False on timeout.
        """
        return self.complete_event.wait(timeout)

    def get_audio_data(self):
        """Return all collected audio chunks concatenated into one bytes object."""
        return b''.join(self.audio_chunks)
def _write_wav(audio_data: bytes) -> str:
    """Write raw 24 kHz mono 16-bit PCM bytes to a temp WAV file; return its path."""
    # Create the file first so we control its lifetime (delete=False), then
    # let the wave module write the proper RIFF header and frames.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        output_path = tmp_file.name
    with wave.open(output_path, 'wb') as wav_file:
        wav_file.setnchannels(1)      # Mono
        wav_file.setsampwidth(2)      # 16-bit samples
        wav_file.setframerate(24000)  # 24 kHz
        wav_file.writeframes(audio_data)
    return output_path


def synthesize_speech(audio_file, text_input):
    """Clone a voice from a recording and synthesize the given text with it.

    Args:
        audio_file: Path to the recorded audio file (from the Gradio audio
            component), or None/"" if nothing was recorded.
        text_input: Text to synthesize.

    Returns:
        tuple: ``(wav_path_or_None, status_message)``. Never raises — all
        failures are reported through the status message.
    """
    try:
        # Validate inputs before doing any network work.
        if not audio_file:
            return None, "❌ Please record a voice sample first"
        if not text_input or text_input.strip() == "":
            return None, "❌ Please enter the text to synthesize"
        init_dashscope_api_key()
        # Step 1: enroll a voice clone from the recorded sample.
        print("🎀 Creating voice clone...")
        voice_id = create_voice(audio_file, audio_mime_type="audio/wav")
        # Step 2: stream synthesis over the realtime websocket API.
        print("πŸ”Š Synthesizing speech...")
        callback = TTSCallback()
        qwen_tts_realtime = QwenTtsRealtime(
            model=DEFAULT_TARGET_MODEL,
            callback=callback,
            url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
        )
        qwen_tts_realtime.connect()
        qwen_tts_realtime.update_session(
            voice=voice_id,
            response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
            mode='server_commit'
        )
        qwen_tts_realtime.append_text(text_input)
        qwen_tts_realtime.finish()
        # Block until the server signals session.finished (or an error).
        callback.wait_for_finished()
        if callback.error_msg:
            return None, f"❌ Synthesis failed: {callback.error_msg}"
        audio_data = callback.get_audio_data()
        if not audio_data:
            return None, "❌ No audio data generated"
        output_path = _write_wav(audio_data)
        success_msg = f"βœ… Synthesis successful! Session ID: {qwen_tts_realtime.get_session_id()}"
        print(success_msg)
        return output_path, success_msg
    except Exception as e:
        # Surface any failure to the UI instead of crashing the Gradio worker.
        error_msg = f"❌ An error occurred: {str(e)}"
        print(error_msg)
        return None, error_msg
# ======= Gradio Interface =======
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for voice cloning and synthesis."""
    with gr.Blocks(title="Qwen Voice Cloning and Synthesis", theme=gr.themes.Soft()) as demo:
        # Header with usage instructions.
        gr.Markdown("""
# πŸŽ™οΈ Qwen Voice Cloning and Synthesis
**Usage Steps:**
1. Click the microphone icon to record a voice sample (recommended 10-30 seconds, clear and natural)
2. Enter the text content to synthesize
3. Click the "Start Synthesis" button
4. Wait for synthesis to complete, then play or download the result
**Notes:**
- Please ensure the environment variable `DASHSCOPE_API_KEY` is set
- Better recording quality leads to better synthesis results
""")
        # Two-column layout: inputs on the left, results on the right.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Step 1: Record Voice Sample")
                # Microphone recording, saved to a WAV file path for create_voice().
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record Voice",
                    format="wav"
                )
                gr.Markdown("### Step 2: Enter Text to Synthesize")
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Please enter the text content to synthesize...",
                    lines=5,
                    value="Hello, this is a voice synthesized using voice cloning technology."
                )
                submit_btn = gr.Button("🎡 Start Synthesis", variant="primary", size="lg")
            with gr.Column(scale=1):
                gr.Markdown("### Synthesis Result")
                # Read-only status line fed by synthesize_speech's message output.
                status_output = gr.Textbox(
                    label="Status Information",
                    interactive=False,
                    lines=2
                )
                audio_output = gr.Audio(
                    label="Synthesized Voice",
                    type="filepath"
                )
        # Bind events: synthesize_speech returns (wav_path, status_message).
        submit_btn.click(
            fn=synthesize_speech,
            inputs=[audio_input, text_input],
            outputs=[audio_output, status_output]
        )
        gr.Markdown("""
---
πŸ’‘ **Tip:** For better results, please ensure a quiet recording environment and clear, natural pronunciation.
""")
    return demo
if __name__ == "__main__":
    # Verify the API key up front so misconfiguration shows in the logs
    # instead of failing on the first synthesis request.
    try:
        init_dashscope_api_key()
        print("βœ… API Key verified successfully")
    except (KeyError, ValueError) as e:
        # os.environ['API_KEY'] raises KeyError when the variable is unset,
        # so it must be caught alongside the explicit ValueError — otherwise
        # the warning below is unreachable and the app crashes on startup.
        print(f"⚠️ Warning: {e}")
        print("Please set the environment variable: export DASHSCOPE_API_KEY='your-api-key'")
    demo = create_gradio_interface()
    # Bind to all interfaces on the standard HF Spaces port.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False
    )