# EchoX / tts_wrapper.py
# Source: Hugging Face file viewer (uploaded by tzzte in "Upload 4 files",
# commit 4ff5a32, verified; file size 1.68 kB).
import sys
import os
import torch
import torchaudio
sys.path.insert(0, './CosyVoice')
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
class CosyVoice2TTS:
    """Thin wrapper around ``CosyVoice2`` for zero-shot speech synthesis.

    The model is loaded once in the constructor; ``synthesize`` then clones
    the voice of a prompt recording and optionally writes the result to disk.
    """

    # Fallback used when the loaded model does not expose ``sample_rate``.
    DEFAULT_SAMPLE_RATE = 24000

    def __init__(self, model_dir: str, device: str = "cuda"):
        """Load the CosyVoice2 model stored in ``model_dir``.

        Args:
            model_dir: Directory passed straight to ``CosyVoice2``.
            device: Accepted for interface compatibility; this constructor
                does not currently forward it to the model.
        """
        print(f"[TTS] Loading CosyVoice2 model from {model_dir}...")
        # Initialize the model with JIT, TensorRT and vLLM disabled and
        # half-precision enabled.
        self.model = CosyVoice2(
            model_dir,
            load_jit=False,
            load_trt=False,
            load_vllm=False,
            fp16=True,
        )
        print("[TTS] CosyVoice2 Model loaded successfully.")

    def synthesize(self, text: str, prompt_text: str, prompt_speech_path: str,
                   output_path=None, stream: bool = False):
        """Synthesize ``text`` in the voice of the prompt recording.

        Args:
            text: Text to speak. An empty/``None`` value short-circuits.
            prompt_text: Transcript of the prompt recording.
            prompt_speech_path: Path to the prompt audio; loaded at 16 kHz.
            output_path: Optional file path to save the audio to. Its parent
                directory is created if it does not exist.
            stream: Forwarded to ``inference_zero_shot``. All yielded chunks
                are still collected and concatenated before returning.

        Returns:
            ``(sample_rate, audio)`` where ``audio`` is a NumPy array built
            from the chunks concatenated along dim 1 (presumably
            ``(channels, samples)`` -- TODO confirm), or ``(None, None)``
            when ``text`` is empty or the model yields no audio.
        """
        if not text:
            return None, None

        # Load the prompt audio resampled to 16 kHz.
        prompt_speech_16k = load_wav(prompt_speech_path, 16000)

        # Run zero-shot inference; the result is iterated chunk by chunk.
        output = self.model.inference_zero_shot(
            tts_text=text,
            prompt_text=prompt_text,
            prompt_speech_16k=prompt_speech_16k,
            stream=stream,
        )

        # Read the sample rate from the model, falling back to the default.
        sample_rate = getattr(
            self.model, 'sample_rate', self.DEFAULT_SAMPLE_RATE
        )

        chunks = [chunk['tts_speech'] for chunk in output]
        if not chunks:
            return None, None

        # Move to CPU once so both saving and the NumPy conversion work even
        # if the model produced tensors on an accelerator.
        full_audio_tensor = torch.cat(chunks, dim=1).cpu()

        if output_path:
            # A bare filename has an empty dirname, and os.makedirs("")
            # raises FileNotFoundError, so only create a real parent dir.
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            torchaudio.save(output_path, full_audio_tensor, sample_rate)
            print(f"[TTS] Audio saved to {output_path}")

        return sample_rate, full_audio_tensor.numpy()