|
|
|
|
|
|
|
|
|
|
|
|
|
|
use clap::{Parser, Subcommand}; |
|
|
use indextts::{ |
|
|
pipeline::{IndexTTS, SynthesisOptions}, |
|
|
Config, Result, |
|
|
}; |
|
|
use std::path::PathBuf; |
|
|
|
|
|
#[derive(Parser)] |
|
|
#[command( |
|
|
name = "indextts", |
|
|
about = "High-performance Text-to-Speech engine in Rust", |
|
|
version, |
|
|
author |
|
|
)] |
|
|
struct Cli { |
|
|
#[command(subcommand)] |
|
|
command: Commands, |
|
|
} |
|
|
|
|
|
#[derive(Subcommand)] |
|
|
enum Commands { |
|
|
|
|
|
Synthesize { |
|
|
|
|
|
#[arg(short, long)] |
|
|
text: String, |
|
|
|
|
|
|
|
|
#[arg(short = 'v', long)] |
|
|
voice: PathBuf, |
|
|
|
|
|
|
|
|
#[arg(short, long, default_value = "output.wav")] |
|
|
output: PathBuf, |
|
|
|
|
|
|
|
|
#[arg(short, long)] |
|
|
config: Option<PathBuf>, |
|
|
|
|
|
|
|
|
#[arg(short, long, default_value = "models")] |
|
|
model_dir: PathBuf, |
|
|
|
|
|
|
|
|
#[arg(long)] |
|
|
emotion: Option<String>, |
|
|
|
|
|
|
|
|
#[arg(long, default_value = "1.0")] |
|
|
emotion_alpha: f32, |
|
|
|
|
|
|
|
|
#[arg(long, default_value = "50")] |
|
|
top_k: usize, |
|
|
|
|
|
|
|
|
#[arg(long, default_value = "0.95")] |
|
|
top_p: f32, |
|
|
|
|
|
|
|
|
#[arg(long, default_value = "1.1")] |
|
|
repetition_penalty: f32, |
|
|
|
|
|
|
|
|
#[arg(long)] |
|
|
fp16: bool, |
|
|
|
|
|
|
|
|
#[arg(short, long, default_value = "cpu")] |
|
|
device: String, |
|
|
}, |
|
|
|
|
|
|
|
|
SynthesizeFile { |
|
|
|
|
|
#[arg(short, long)] |
|
|
input: PathBuf, |
|
|
|
|
|
|
|
|
#[arg(short = 'v', long)] |
|
|
voice: PathBuf, |
|
|
|
|
|
|
|
|
#[arg(short, long, default_value = "output.wav")] |
|
|
output: PathBuf, |
|
|
|
|
|
|
|
|
#[arg(short, long)] |
|
|
config: Option<PathBuf>, |
|
|
|
|
|
|
|
|
#[arg(short, long, default_value = "models")] |
|
|
model_dir: PathBuf, |
|
|
|
|
|
|
|
|
#[arg(long, default_value = "200")] |
|
|
silence_ms: u32, |
|
|
}, |
|
|
|
|
|
|
|
|
InitConfig { |
|
|
|
|
|
#[arg(short, long, default_value = "config.yaml")] |
|
|
output: PathBuf, |
|
|
}, |
|
|
|
|
|
|
|
|
Info, |
|
|
|
|
|
|
|
|
Benchmark { |
|
|
|
|
|
#[arg(short, long, default_value = "10")] |
|
|
iterations: usize, |
|
|
}, |
|
|
} |
|
|
|
|
|
fn main() -> Result<()> { |
|
|
|
|
|
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); |
|
|
|
|
|
let cli = Cli::parse(); |
|
|
|
|
|
match cli.command { |
|
|
Commands::Synthesize { |
|
|
text, |
|
|
voice, |
|
|
output, |
|
|
config, |
|
|
model_dir, |
|
|
emotion, |
|
|
emotion_alpha, |
|
|
top_k, |
|
|
top_p, |
|
|
repetition_penalty, |
|
|
fp16: _, |
|
|
device: _, |
|
|
} => { |
|
|
log::info!("IndexTTS Synthesizer"); |
|
|
log::info!("===================="); |
|
|
|
|
|
|
|
|
let cfg = if let Some(config_path) = config { |
|
|
Config::load(config_path)? |
|
|
} else { |
|
|
let mut cfg = Config::default(); |
|
|
cfg.model_dir = model_dir; |
|
|
cfg |
|
|
}; |
|
|
|
|
|
|
|
|
let tts = IndexTTS::new(cfg)?; |
|
|
|
|
|
|
|
|
let emotion_vec = emotion.map(|s| { |
|
|
s.split(',') |
|
|
.filter_map(|v| v.trim().parse::<f32>().ok()) |
|
|
.collect::<Vec<f32>>() |
|
|
}); |
|
|
|
|
|
|
|
|
let options = SynthesisOptions { |
|
|
emotion_vector: emotion_vec, |
|
|
emotion_alpha, |
|
|
sampling: indextts::model::SamplingStrategy::TopKP { k: top_k, p: top_p }, |
|
|
repetition_penalty, |
|
|
..Default::default() |
|
|
}; |
|
|
|
|
|
|
|
|
log::info!("Text: {}", &text[..text.len().min(100)]); |
|
|
log::info!("Voice: {}", voice.display()); |
|
|
log::info!("Output: {}", output.display()); |
|
|
|
|
|
let result = tts.synthesize_to_file( |
|
|
&text, |
|
|
voice.to_str().unwrap(), |
|
|
output.to_str().unwrap(), |
|
|
&options, |
|
|
)?; |
|
|
|
|
|
log::info!("Duration: {}", result.duration_formatted()); |
|
|
log::info!("Processing time: {:.2}s", result.processing_time); |
|
|
log::info!("Real-time factor: {:.3}x", result.rtf); |
|
|
|
|
|
println!("✓ Synthesis complete: {}", output.display()); |
|
|
} |
|
|
|
|
|
Commands::SynthesizeFile { |
|
|
input, |
|
|
voice, |
|
|
output, |
|
|
config, |
|
|
model_dir, |
|
|
silence_ms, |
|
|
} => { |
|
|
log::info!("IndexTTS File Synthesizer"); |
|
|
log::info!("=========================="); |
|
|
|
|
|
|
|
|
let text = std::fs::read_to_string(&input)?; |
|
|
|
|
|
|
|
|
let cfg = if let Some(config_path) = config { |
|
|
Config::load(config_path)? |
|
|
} else { |
|
|
let mut cfg = Config::default(); |
|
|
cfg.model_dir = model_dir; |
|
|
cfg |
|
|
}; |
|
|
|
|
|
|
|
|
let tts = IndexTTS::new(cfg)?; |
|
|
|
|
|
|
|
|
let options = SynthesisOptions { |
|
|
segment_silence_ms: silence_ms, |
|
|
..Default::default() |
|
|
}; |
|
|
|
|
|
|
|
|
log::info!("Input file: {}", input.display()); |
|
|
log::info!("Text length: {} characters", text.len()); |
|
|
|
|
|
let result = tts.synthesize_long( |
|
|
&text, |
|
|
voice.to_str().unwrap(), |
|
|
&options, |
|
|
)?; |
|
|
|
|
|
result.save(&output)?; |
|
|
|
|
|
log::info!("Duration: {}", result.duration_formatted()); |
|
|
log::info!("Processing time: {:.2}s", result.processing_time); |
|
|
log::info!("Real-time factor: {:.3}x", result.rtf); |
|
|
|
|
|
println!("✓ Synthesis complete: {}", output.display()); |
|
|
} |
|
|
|
|
|
Commands::InitConfig { output } => { |
|
|
log::info!("Creating default configuration..."); |
|
|
|
|
|
let config = Config::default(); |
|
|
config.save(&output)?; |
|
|
|
|
|
println!("✓ Configuration saved to: {}", output.display()); |
|
|
} |
|
|
|
|
|
Commands::Info => { |
|
|
println!("IndexTTS - High-performance Text-to-Speech Engine"); |
|
|
println!("=================================================="); |
|
|
println!("Version: {}", indextts::VERSION); |
|
|
println!("Platform: {}", std::env::consts::OS); |
|
|
println!("Architecture: {}", std::env::consts::ARCH); |
|
|
println!(); |
|
|
println!("Features:"); |
|
|
println!(" - Multi-language support (Chinese, English, mixed)"); |
|
|
println!(" - Zero-shot voice cloning"); |
|
|
println!(" - 8-dimensional emotion control"); |
|
|
println!(" - High-quality neural vocoding (BigVGAN)"); |
|
|
println!(" - SIMD-optimized audio processing"); |
|
|
println!(" - Parallel processing with Rayon"); |
|
|
println!(); |
|
|
println!("Sample Rate: {} Hz", indextts::SAMPLE_RATE); |
|
|
println!("Mel Bands: {}", indextts::N_MELS); |
|
|
println!("FFT Size: {}", indextts::N_FFT); |
|
|
println!("Hop Length: {}", indextts::HOP_LENGTH); |
|
|
println!(); |
|
|
println!("CPU Cores: {}", num_cpus::get()); |
|
|
println!("Physical Cores: {}", num_cpus::get_physical()); |
|
|
} |
|
|
|
|
|
Commands::Benchmark { iterations } => { |
|
|
log::info!("Running benchmarks ({} iterations)...", iterations); |
|
|
|
|
|
|
|
|
benchmark_mel_spectrogram(iterations); |
|
|
|
|
|
|
|
|
benchmark_tokenization(iterations); |
|
|
|
|
|
|
|
|
benchmark_vocoder(iterations); |
|
|
|
|
|
println!("✓ Benchmarks complete"); |
|
|
} |
|
|
} |
|
|
|
|
|
Ok(()) |
|
|
} |
|
|
|
|
|
fn benchmark_mel_spectrogram(iterations: usize) { |
|
|
use indextts::audio::{mel_spectrogram, AudioConfig}; |
|
|
use std::time::Instant; |
|
|
|
|
|
println!("\nMel-Spectrogram Benchmark"); |
|
|
println!("-------------------------"); |
|
|
|
|
|
let config = AudioConfig::default(); |
|
|
let num_samples = config.sample_rate as usize; |
|
|
let signal: Vec<f32> = (0..num_samples) |
|
|
.map(|i| (i as f32 * 0.01).sin()) |
|
|
.collect(); |
|
|
|
|
|
let start = Instant::now(); |
|
|
for _ in 0..iterations { |
|
|
let _ = mel_spectrogram(&signal, &config); |
|
|
} |
|
|
let elapsed = start.elapsed(); |
|
|
|
|
|
let per_iter = elapsed.as_secs_f32() / iterations as f32; |
|
|
println!(" Signal length: {} samples ({:.2}s)", num_samples, num_samples as f32 / config.sample_rate as f32); |
|
|
println!(" Iterations: {}", iterations); |
|
|
println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
|
|
println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
|
|
println!(" Throughput: {:.1}x real-time", 1.0 / per_iter); |
|
|
} |
|
|
|
|
|
fn benchmark_tokenization(iterations: usize) { |
|
|
use indextts::text::{TextNormalizer, TextTokenizer, TokenizerConfig}; |
|
|
use std::time::Instant; |
|
|
|
|
|
println!("\nTokenization Benchmark"); |
|
|
println!("----------------------"); |
|
|
|
|
|
let normalizer = TextNormalizer::new(); |
|
|
let tokenizer = TextTokenizer::new(TokenizerConfig::default()).unwrap(); |
|
|
|
|
|
let test_texts = vec![ |
|
|
"Hello world, this is a test of the text-to-speech system.", |
|
|
"The quick brown fox jumps over the lazy dog.", |
|
|
"你好世界,这是一个测试。", |
|
|
"Mixed language: Hello 世界 and 你好 world.", |
|
|
]; |
|
|
|
|
|
let start = Instant::now(); |
|
|
for _ in 0..iterations { |
|
|
for text in &test_texts { |
|
|
let normalized = normalizer.normalize(text).unwrap(); |
|
|
let _tokens = tokenizer.encode(&normalized).unwrap(); |
|
|
} |
|
|
} |
|
|
let elapsed = start.elapsed(); |
|
|
|
|
|
let total_chars: usize = test_texts.iter().map(|t| t.len()).sum(); |
|
|
let per_iter = elapsed.as_secs_f32() / iterations as f32; |
|
|
println!(" Texts: {}", test_texts.len()); |
|
|
println!(" Total characters: {}", total_chars); |
|
|
println!(" Iterations: {}", iterations); |
|
|
println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
|
|
println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
|
|
println!( |
|
|
" Throughput: {:.0} chars/sec", |
|
|
(total_chars * iterations) as f32 / elapsed.as_secs_f32() |
|
|
); |
|
|
} |
|
|
|
|
|
fn benchmark_vocoder(iterations: usize) { |
|
|
use indextts::vocoder::{create_bigvgan_22k, Vocoder}; |
|
|
use ndarray::Array2; |
|
|
use std::time::Instant; |
|
|
|
|
|
println!("\nVocoder Benchmark"); |
|
|
println!("-----------------"); |
|
|
|
|
|
let vocoder = create_bigvgan_22k(); |
|
|
let num_frames = 100; |
|
|
let mel = Array2::zeros((80, num_frames)); |
|
|
|
|
|
let start = Instant::now(); |
|
|
for _ in 0..iterations { |
|
|
let _ = vocoder.synthesize(&mel); |
|
|
} |
|
|
let elapsed = start.elapsed(); |
|
|
|
|
|
let audio_duration = num_frames as f32 * vocoder.hop_length() as f32 / vocoder.sample_rate() as f32; |
|
|
let per_iter = elapsed.as_secs_f32() / iterations as f32; |
|
|
println!(" Mel frames: {}", num_frames); |
|
|
println!(" Audio duration: {:.2}s", audio_duration); |
|
|
println!(" Iterations: {}", iterations); |
|
|
println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
|
|
println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
|
|
println!(" RTF: {:.3}x", per_iter / audio_duration); |
|
|
} |
|
|
|