IndexTTS-Rust / src /main.rs
Claude
Convert IndexTTS to pure Rust implementation
2bbfbb7 unverified
//! IndexTTS CLI - High-performance Text-to-Speech in Rust
//!
//! Command-line interface for IndexTTS synthesizer
use clap::{Parser, Subcommand};
use indextts::{
pipeline::{IndexTTS, SynthesisOptions},
Config, Result,
};
use std::path::PathBuf;
// Top-level command-line arguments for the `indextts` binary.
// Argument parsing is generated by clap's `Parser` derive; the struct holds
// nothing but the subcommand the user selected.
// NOTE(review): the bare `version` / `author` keys are presumably filled in
// by clap from the Cargo package metadata — confirm against the clap docs.
#[derive(Parser)]
#[command(
name = "indextts",
about = "High-performance Text-to-Speech engine in Rust",
version,
author
)]
struct Cli {
// The subcommand to run; `main` matches on this value to dispatch.
#[command(subcommand)]
command: Commands,
}
// Subcommands understood by the CLI. `main` matches on this enum and runs the
// corresponding action.
//
// Maintainer notes use plain `//` comments on purpose: with clap's derive API
// the `///` comments below are user-facing (they are expected to become the
// `--help` text), so treat them as runtime strings when editing.
#[derive(Subcommand)]
enum Commands {
/// Synthesize speech from text
Synthesize {
/// Text to synthesize
#[arg(short, long)]
text: String,
// Explicit short flag `-v` (the default short would be derived from the
// field name).
/// Speaker reference audio file
#[arg(short = 'v', long)]
voice: PathBuf,
/// Output audio file path
#[arg(short, long, default_value = "output.wav")]
output: PathBuf,
// When a config file is given, `main` loads it and does not apply
// `model_dir` on top of it.
/// Configuration file path
#[arg(short, long)]
config: Option<PathBuf>,
// Only used by `main` when no `config` file is supplied: it is written into
// the default `Config`.
/// Model directory
#[arg(short, long, default_value = "models")]
model_dir: PathBuf,
// `main` splits this on ',' and parses each trimmed piece as `f32`; pieces
// that fail to parse are dropped. The 8-value / 0-1 range stated in the help
// text is not enforced here.
/// Emotion vector (comma-separated, 8 values 0-1)
#[arg(long)]
emotion: Option<String>,
// Passed through to `SynthesisOptions::emotion_alpha`; the 0-1 range is not
// validated by the CLI.
/// Emotion strength (0-1)
#[arg(long, default_value = "1.0")]
emotion_alpha: f32,
// `top_k` and `top_p` together become `SamplingStrategy::TopKP { k, p }`.
/// Top-k sampling parameter
#[arg(long, default_value = "50")]
top_k: usize,
/// Top-p sampling parameter
#[arg(long, default_value = "0.95")]
top_p: f32,
/// Repetition penalty
#[arg(long, default_value = "1.1")]
repetition_penalty: f32,
// NOTE(review): accepted on the command line but not forwarded to the
// pipeline by `main`.
/// Use FP16 inference
#[arg(long)]
fp16: bool,
// NOTE(review): accepted on the command line but not forwarded to the
// pipeline by `main`.
/// Device (cpu, cuda:0, etc.)
#[arg(short, long, default_value = "cpu")]
device: String,
},
/// Synthesize from a text file
SynthesizeFile {
// The whole file is read into memory as a UTF-8 string before synthesis.
/// Input text file
#[arg(short, long)]
input: PathBuf,
/// Speaker reference audio file
#[arg(short = 'v', long)]
voice: PathBuf,
/// Output audio file path
#[arg(short, long, default_value = "output.wav")]
output: PathBuf,
// Same precedence as in `Synthesize`: a config file wins over `model_dir`.
/// Configuration file path
#[arg(short, long)]
config: Option<PathBuf>,
/// Model directory
#[arg(short, long, default_value = "models")]
model_dir: PathBuf,
// Forwarded as `SynthesisOptions::segment_silence_ms`.
/// Silence between segments (milliseconds)
#[arg(long, default_value = "200")]
silence_ms: u32,
},
/// Generate default configuration file
InitConfig {
// `Config::default()` is saved to this path.
/// Output path for config file
#[arg(short, long, default_value = "config.yaml")]
output: PathBuf,
},
/// Show information about the system
Info,
/// Run benchmarks
Benchmark {
// The same iteration count is used for each of the three benchmarks.
/// Number of iterations
#[arg(short, long, default_value = "10")]
iterations: usize,
},
}
fn main() -> Result<()> {
// Initialize logger
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
let cli = Cli::parse();
match cli.command {
Commands::Synthesize {
text,
voice,
output,
config,
model_dir,
emotion,
emotion_alpha,
top_k,
top_p,
repetition_penalty,
fp16: _,
device: _,
} => {
log::info!("IndexTTS Synthesizer");
log::info!("====================");
// Load or create config
let cfg = if let Some(config_path) = config {
Config::load(config_path)?
} else {
let mut cfg = Config::default();
cfg.model_dir = model_dir;
cfg
};
// Create TTS instance
let tts = IndexTTS::new(cfg)?;
// Parse emotion vector
let emotion_vec = emotion.map(|s| {
s.split(',')
.filter_map(|v| v.trim().parse::<f32>().ok())
.collect::<Vec<f32>>()
});
// Create synthesis options
let options = SynthesisOptions {
emotion_vector: emotion_vec,
emotion_alpha,
sampling: indextts::model::SamplingStrategy::TopKP { k: top_k, p: top_p },
repetition_penalty,
..Default::default()
};
// Synthesize
log::info!("Text: {}", &text[..text.len().min(100)]);
log::info!("Voice: {}", voice.display());
log::info!("Output: {}", output.display());
let result = tts.synthesize_to_file(
&text,
voice.to_str().unwrap(),
output.to_str().unwrap(),
&options,
)?;
log::info!("Duration: {}", result.duration_formatted());
log::info!("Processing time: {:.2}s", result.processing_time);
log::info!("Real-time factor: {:.3}x", result.rtf);
println!("✓ Synthesis complete: {}", output.display());
}
Commands::SynthesizeFile {
input,
voice,
output,
config,
model_dir,
silence_ms,
} => {
log::info!("IndexTTS File Synthesizer");
log::info!("==========================");
// Read text file
let text = std::fs::read_to_string(&input)?;
// Load or create config
let cfg = if let Some(config_path) = config {
Config::load(config_path)?
} else {
let mut cfg = Config::default();
cfg.model_dir = model_dir;
cfg
};
// Create TTS instance
let tts = IndexTTS::new(cfg)?;
// Create synthesis options
let options = SynthesisOptions {
segment_silence_ms: silence_ms,
..Default::default()
};
// Synthesize
log::info!("Input file: {}", input.display());
log::info!("Text length: {} characters", text.len());
let result = tts.synthesize_long(
&text,
voice.to_str().unwrap(),
&options,
)?;
result.save(&output)?;
log::info!("Duration: {}", result.duration_formatted());
log::info!("Processing time: {:.2}s", result.processing_time);
log::info!("Real-time factor: {:.3}x", result.rtf);
println!("✓ Synthesis complete: {}", output.display());
}
Commands::InitConfig { output } => {
log::info!("Creating default configuration...");
let config = Config::default();
config.save(&output)?;
println!("✓ Configuration saved to: {}", output.display());
}
Commands::Info => {
println!("IndexTTS - High-performance Text-to-Speech Engine");
println!("==================================================");
println!("Version: {}", indextts::VERSION);
println!("Platform: {}", std::env::consts::OS);
println!("Architecture: {}", std::env::consts::ARCH);
println!();
println!("Features:");
println!(" - Multi-language support (Chinese, English, mixed)");
println!(" - Zero-shot voice cloning");
println!(" - 8-dimensional emotion control");
println!(" - High-quality neural vocoding (BigVGAN)");
println!(" - SIMD-optimized audio processing");
println!(" - Parallel processing with Rayon");
println!();
println!("Sample Rate: {} Hz", indextts::SAMPLE_RATE);
println!("Mel Bands: {}", indextts::N_MELS);
println!("FFT Size: {}", indextts::N_FFT);
println!("Hop Length: {}", indextts::HOP_LENGTH);
println!();
println!("CPU Cores: {}", num_cpus::get());
println!("Physical Cores: {}", num_cpus::get_physical());
}
Commands::Benchmark { iterations } => {
log::info!("Running benchmarks ({} iterations)...", iterations);
// Benchmark mel-spectrogram computation
benchmark_mel_spectrogram(iterations);
// Benchmark tokenization
benchmark_tokenization(iterations);
// Benchmark vocoder
benchmark_vocoder(iterations);
println!("✓ Benchmarks complete");
}
}
Ok(())
}
/// Times `iterations` calls of `mel_spectrogram` on one second of a synthetic
/// sine signal (using the default `AudioConfig`) and prints the statistics.
///
/// With `iterations == 0` nothing is measured and a notice is printed, since
/// the per-iteration figures would otherwise be NaN.
fn benchmark_mel_spectrogram(iterations: usize) {
    use indextts::audio::{mel_spectrogram, AudioConfig};
    use std::hint::black_box;
    use std::time::Instant;

    println!("\nMel-Spectrogram Benchmark");
    println!("-------------------------");

    if iterations == 0 {
        println!(" Skipped: iterations must be at least 1");
        return;
    }

    let config = AudioConfig::default();
    // One second of audio at the configured sample rate.
    let num_samples = config.sample_rate as usize;
    let signal: Vec<f32> = (0..num_samples)
        .map(|i| (i as f32 * 0.01).sin())
        .collect();

    let start = Instant::now();
    for _ in 0..iterations {
        // `black_box` keeps the optimizer from discarding the unused result.
        let _ = black_box(mel_spectrogram(&signal, &config));
    }
    let elapsed = start.elapsed();

    let total_secs = elapsed.as_secs_f32();
    let per_iter = total_secs / iterations as f32;

    println!(
        " Signal length: {} samples ({:.2}s)",
        num_samples,
        num_samples as f32 / config.sample_rate as f32
    );
    println!(" Iterations: {}", iterations);
    println!(" Total time: {:.3}s", total_secs);
    println!(" Per iteration: {:.3}ms", per_iter * 1000.0);
    // The signal is one second long, so 1 / per_iter is the real-time multiple.
    println!(" Throughput: {:.1}x real-time", 1.0 / per_iter);
}
/// Times `iterations` passes of normalize + encode over a small fixed set of
/// English, Chinese and mixed sentences and prints the statistics.
///
/// Character counts are Unicode scalar values (not bytes), so the
/// "characters" figures are meaningful for the non-ASCII samples too.
/// With `iterations == 0` nothing is measured and a notice is printed.
///
/// # Panics
///
/// Panics if the tokenizer cannot be built from its default configuration or
/// if normalizing/encoding one of the built-in samples fails.
fn benchmark_tokenization(iterations: usize) {
    use indextts::text::{TextNormalizer, TextTokenizer, TokenizerConfig};
    use std::hint::black_box;
    use std::time::Instant;

    println!("\nTokenization Benchmark");
    println!("----------------------");

    if iterations == 0 {
        println!(" Skipped: iterations must be at least 1");
        return;
    }

    let normalizer = TextNormalizer::new();
    let tokenizer = TextTokenizer::new(TokenizerConfig::default())
        .expect("default tokenizer configuration should be valid");

    let test_texts = [
        "Hello world, this is a test of the text-to-speech system.",
        "The quick brown fox jumps over the lazy dog.",
        "你好世界,这是一个测试。",
        "Mixed language: Hello 世界 and 你好 world.",
    ];

    let start = Instant::now();
    for _ in 0..iterations {
        for text in &test_texts {
            let normalized = normalizer
                .normalize(text)
                .expect("built-in sample should normalize");
            let tokens = tokenizer
                .encode(&normalized)
                .expect("built-in sample should encode");
            // Keep the optimizer from discarding the unused tokens.
            black_box(tokens);
        }
    }
    let elapsed = start.elapsed();

    // `str::len` is a byte count; count characters to match the labels below.
    let total_chars: usize = test_texts.iter().map(|t| t.chars().count()).sum();
    let total_secs = elapsed.as_secs_f32();
    let per_iter = total_secs / iterations as f32;

    println!(" Texts: {}", test_texts.len());
    println!(" Total characters: {}", total_chars);
    println!(" Iterations: {}", iterations);
    println!(" Total time: {:.3}s", total_secs);
    println!(" Per iteration: {:.3}ms", per_iter * 1000.0);
    println!(
        " Throughput: {:.0} chars/sec",
        (total_chars * iterations) as f32 / total_secs
    );
}
/// Times `iterations` calls of the vocoder's `synthesize` on an all-zero
/// 80 x 100 mel array and prints the statistics, including the real-time
/// factor (time per call divided by the duration of audio it represents).
///
/// With `iterations == 0` nothing is measured and a notice is printed, since
/// the per-iteration figures would otherwise be NaN.
fn benchmark_vocoder(iterations: usize) {
    use indextts::vocoder::{create_bigvgan_22k, Vocoder};
    use ndarray::Array2;
    use std::hint::black_box;
    use std::time::Instant;

    println!("\nVocoder Benchmark");
    println!("-----------------");

    if iterations == 0 {
        println!(" Skipped: iterations must be at least 1");
        return;
    }

    let vocoder = create_bigvgan_22k();
    // The audio duration these frames represent depends on the vocoder's hop
    // length and sample rate; it is computed from them below.
    let num_frames = 100;
    let mel = Array2::zeros((80, num_frames));

    let start = Instant::now();
    for _ in 0..iterations {
        // `black_box` keeps the optimizer from discarding the unused result.
        let _ = black_box(vocoder.synthesize(&mel));
    }
    let elapsed = start.elapsed();

    let audio_duration =
        num_frames as f32 * vocoder.hop_length() as f32 / vocoder.sample_rate() as f32;
    let total_secs = elapsed.as_secs_f32();
    let per_iter = total_secs / iterations as f32;

    println!(" Mel frames: {}", num_frames);
    println!(" Audio duration: {:.2}s", audio_duration);
    println!(" Iterations: {}", iterations);
    println!(" Total time: {:.3}s", total_secs);
    println!(" Per iteration: {:.3}ms", per_iter * 1000.0);
    println!(" RTF: {:.3}x", per_iter / audio_duration);
}