Spaces:
Running
Running
jhj0517
committed on
Commit
·
899eb46
1
Parent(s):
edcb1e1
add Silero VAD Options
Browse files
modules/faster_whisper_inference.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import BinaryIO, Union, Tuple, List
|
|
| 6 |
from datetime import datetime
|
| 7 |
|
| 8 |
import faster_whisper
|
|
|
|
| 9 |
import ctranslate2
|
| 10 |
import whisper
|
| 11 |
import torch
|
|
@@ -260,6 +261,15 @@ class FasterWhisperInference(BaseInterface):
|
|
| 260 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
| 261 |
params.lang = language_code_dict[params.lang]
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
segments, info = self.model.transcribe(
|
| 264 |
audio=audio,
|
| 265 |
language=params.lang,
|
|
@@ -272,6 +282,7 @@ class FasterWhisperInference(BaseInterface):
|
|
| 272 |
temperature=params.temperature,
|
| 273 |
compression_ratio_threshold=params.compression_ratio_threshold,
|
| 274 |
vad_filter=params.vad_filter,
|
|
|
|
| 275 |
)
|
| 276 |
progress(0, desc="Loading audio..")
|
| 277 |
|
|
|
|
| 6 |
from datetime import datetime
|
| 7 |
|
| 8 |
import faster_whisper
|
| 9 |
+
from faster_whisper.vad import VadOptions
|
| 10 |
import ctranslate2
|
| 11 |
import whisper
|
| 12 |
import torch
|
|
|
|
| 261 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
| 262 |
params.lang = language_code_dict[params.lang]
|
| 263 |
|
| 264 |
+
vad_options = VadOptions(
|
| 265 |
+
threshold=params.threshold,
|
| 266 |
+
min_speech_duration_ms=params.min_speech_duration_ms,
|
| 267 |
+
max_speech_duration_s=params.max_speech_duration_s,
|
| 268 |
+
min_silence_duration_ms=params.min_silence_duration_ms,
|
| 269 |
+
window_size_samples=params.window_size_samples,
|
| 270 |
+
speech_pad_ms=params.speech_pad_ms
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
segments, info = self.model.transcribe(
|
| 274 |
audio=audio,
|
| 275 |
language=params.lang,
|
|
|
|
| 282 |
temperature=params.temperature,
|
| 283 |
compression_ratio_threshold=params.compression_ratio_threshold,
|
| 284 |
vad_filter=params.vad_filter,
|
| 285 |
+
vad_parameters=vad_options
|
| 286 |
)
|
| 287 |
progress(0, desc="Loading audio..")
|
| 288 |
|
modules/whisper_parameter.py
CHANGED
|
@@ -19,6 +19,12 @@ class WhisperGradioComponents:
|
|
| 19 |
temperature: gr.Slider
|
| 20 |
compression_ratio_threshold: gr.Number
|
| 21 |
vad_filter: gr.Checkbox
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
"""
|
| 23 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
| 24 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
@@ -78,6 +84,33 @@ class WhisperGradioComponents:
|
|
| 78 |
Enable the voice activity detection (VAD) to filter out parts of the audio
|
| 79 |
without speech. This step is using the Silero VAD model
|
| 80 |
https://github.com/snakers4/silero-vad.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
"""
|
| 82 |
|
| 83 |
def to_list(self) -> list:
|
|
@@ -108,6 +141,12 @@ class WhisperValues:
|
|
| 108 |
temperature: float
|
| 109 |
compression_ratio_threshold: float
|
| 110 |
vad_filter: bool
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
"""
|
| 112 |
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
|
| 113 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
|
|
| 19 |
temperature: gr.Slider
|
| 20 |
compression_ratio_threshold: gr.Number
|
| 21 |
vad_filter: gr.Checkbox
|
| 22 |
+
threshold: gr.Slider
|
| 23 |
+
min_speech_duration_ms: gr.Number
|
| 24 |
+
max_speech_duration_s: gr.Number
|
| 25 |
+
min_silence_duration_ms: gr.Number
|
| 26 |
+
window_size_sample: gr.Number
|
| 27 |
+
speech_pad_ms: gr.Number
|
| 28 |
"""
|
| 29 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
| 30 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
|
|
| 84 |
Enable the voice activity detection (VAD) to filter out parts of the audio
|
| 85 |
without speech. This step is using the Silero VAD model
|
| 86 |
https://github.com/snakers4/silero-vad.
|
| 87 |
+
|
| 88 |
+
threshold: gr.Slider
|
| 89 |
+
This parameter is related to Silero VAD. Speech threshold.
|
| 90 |
+
Silero VAD outputs speech probabilities for each audio chunk,
|
| 91 |
+
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
|
| 92 |
+
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
|
| 93 |
+
|
| 94 |
+
min_speech_duration_ms: gr.Number
|
| 95 |
+
This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.
|
| 96 |
+
|
| 97 |
+
max_speech_duration_s: gr.Number
|
| 98 |
+
This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
|
| 99 |
+
than max_speech_duration_s will be split at the timestamp of the last silence that
|
| 100 |
+
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
|
| 101 |
+
split aggressively just before max_speech_duration_s.
|
| 102 |
+
|
| 103 |
+
min_silence_duration_ms: gr.Number
|
| 104 |
+
This parameter is related to Silero VAD. At the end of each speech chunk, wait for min_silence_duration_ms
|
| 105 |
+
before separating it
|
| 106 |
+
|
| 107 |
+
window_size_samples: gr.Number
|
| 108 |
+
This parameter is related to Silero VAD. Audio chunks of window_size_samples size are fed to the Silero VAD model.
|
| 109 |
+
WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
|
| 110 |
+
Values other than these may affect model performance!!
|
| 111 |
+
|
| 112 |
+
speech_pad_ms: gr.Number
|
| 113 |
+
This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side.
|
| 114 |
"""
|
| 115 |
|
| 116 |
def to_list(self) -> list:
|
|
|
|
| 141 |
temperature: float
|
| 142 |
compression_ratio_threshold: float
|
| 143 |
vad_filter: bool
|
| 144 |
+
threshold: float
|
| 145 |
+
min_speech_duration_ms: int
|
| 146 |
+
max_speech_duration_s: float
|
| 147 |
+
min_silence_duration_ms: int
|
| 148 |
+
window_size_samples: int
|
| 149 |
+
speech_pad_ms: int
|
| 150 |
"""
|
| 151 |
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
|
| 152 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|