Merge pull request #175 from jhj0517/feature/integrate-insanely_fast_whisper
Files changed:
- app.py (+111 -86)
- modules/faster_whisper_inference.py (+1 -1)
- modules/insanely_fast_whisper_inference.py (+181 -0)
- modules/whisper_Inference.py (+1 -1)
- modules/whisper_parameter.py (+52 -4)
- user-start-webui.bat (+5 -1)
app.py
CHANGED
@@ -4,6 +4,7 @@ import argparse
 
 from modules.whisper_Inference import WhisperInference
 from modules.faster_whisper_inference import FasterWhisperInference
+from modules.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.nllb_inference import NLLBInference
 from ui.htmls import *
 from modules.youtube_manager import get_ytmetas

@@ -24,12 +25,16 @@ class App:
     def init_whisper(self):
         whisper_type = self.args.whisper_type.lower().strip()
 
-        if whisper_type in ["faster_whisper", "faster-whisper"]:
+        if whisper_type in ["faster_whisper", "faster-whisper", "fasterwhisper"]:
             whisper_inf = FasterWhisperInference()
             whisper_inf.model_dir = self.args.faster_whisper_model_dir
-        ...
+        elif whisper_type in ["whisper"]:
             whisper_inf = WhisperInference()
             whisper_inf.model_dir = self.args.whisper_model_dir
+        elif whisper_type in ["insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
+                              "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"]:
+            whisper_inf = InsanelyFastWhisperInference()
+            whisper_inf.model_dir = self.args.insanely_fast_whisper_model_dir
         else:
             whisper_inf = FasterWhisperInference()
             whisper_inf.model_dir = self.args.faster_whisper_model_dir
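With this dispatch in place, the backend is selected at launch time: any of the aliases accepted above can be passed to the existing --whisper_type argument, and the model directory for the new backend comes from the --insanely_fast_whisper_model_dir argument added at the bottom of this file.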
@@ -69,14 +74,6 @@ class App:
             cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
             with gr.Row():
                 cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
-            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
-                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
-                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
-                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
-                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
-                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
-                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
-                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
             with gr.Accordion("Advanced_Parameters", open=False):
                 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)

@@ -88,6 +85,17 @@ class App:
                 tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                 sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
                 nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
+            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+            with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+                nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
+                nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():

@@ -96,26 +104,28 @@ class App:
             btn_openfolder = gr.Button('📂', scale=1)
 
             params = [input_file, dd_file_format, cb_timestamp]
-            whisper_params = WhisperGradioComponents(...)
+            whisper_params = WhisperParameters(model_size=dd_model,
+                                               lang=dd_lang,
+                                               is_translate=cb_translate,
+                                               beam_size=nb_beam_size,
+                                               log_prob_threshold=nb_log_prob_threshold,
+                                               no_speech_threshold=nb_no_speech_threshold,
+                                               compute_type=dd_compute_type,
+                                               best_of=nb_best_of,
+                                               patience=nb_patience,
+                                               condition_on_previous_text=cb_condition_on_previous_text,
+                                               initial_prompt=tb_initial_prompt,
+                                               temperature=sd_temperature,
+                                               compression_ratio_threshold=nb_compression_ratio_threshold,
+                                               vad_filter=cb_vad_filter,
+                                               threshold=sd_threshold,
+                                               min_speech_duration_ms=nb_min_speech_duration_ms,
+                                               max_speech_duration_s=nb_max_speech_duration_s,
+                                               min_silence_duration_ms=nb_min_silence_duration_ms,
+                                               window_size_sample=nb_window_size_sample,
+                                               speech_pad_ms=nb_speech_pad_ms,
+                                               chunk_length_s=nb_chunk_length_s,
+                                               batch_size=nb_batch_size)
 
             btn_run.click(fn=self.whisper_inf.transcribe_file,
                           inputs=params + whisper_params.to_list(),

@@ -143,14 +153,6 @@ class App:
             with gr.Row():
                 cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                            interactive=True)
-            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
-                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
-                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
-                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
-                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
-                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
-                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
-                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
             with gr.Accordion("Advanced_Parameters", open=False):
                 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)

@@ -162,6 +164,18 @@ class App:
                 tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                 sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
                 nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
+            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+            with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
+                              visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+                nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
+                nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():

@@ -170,26 +184,29 @@ class App:
             btn_openfolder = gr.Button('📂', scale=1)
 
             params = [tb_youtubelink, dd_file_format, cb_timestamp]
-            whisper_params = WhisperGradioComponents(...)
+            whisper_params = WhisperParameters(model_size=dd_model,
+                                               lang=dd_lang,
+                                               is_translate=cb_translate,
+                                               beam_size=nb_beam_size,
+                                               log_prob_threshold=nb_log_prob_threshold,
+                                               no_speech_threshold=nb_no_speech_threshold,
+                                               compute_type=dd_compute_type,
+                                               best_of=nb_best_of,
+                                               patience=nb_patience,
+                                               condition_on_previous_text=cb_condition_on_previous_text,
+                                               initial_prompt=tb_initial_prompt,
+                                               temperature=sd_temperature,
+                                               compression_ratio_threshold=nb_compression_ratio_threshold,
+                                               vad_filter=cb_vad_filter,
+                                               threshold=sd_threshold,
+                                               min_speech_duration_ms=nb_min_speech_duration_ms,
+                                               max_speech_duration_s=nb_max_speech_duration_s,
+                                               min_silence_duration_ms=nb_min_silence_duration_ms,
+                                               window_size_sample=nb_window_size_sample,
+                                               speech_pad_ms=nb_speech_pad_ms,
+                                               chunk_length_s=nb_chunk_length_s,
+                                               batch_size=nb_batch_size)
+
             btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                           inputs=params + whisper_params.to_list(),
                           outputs=[tb_indicator, files_subtitles])

@@ -209,14 +226,6 @@ class App:
             dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
             with gr.Row():
                 cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
-            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
-                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
-                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
-                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
-                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
-                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
-                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
-                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
             with gr.Accordion("Advanced_Parameters", open=False):
                 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)

@@ -227,6 +236,18 @@ class App:
                 cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
                 tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                 sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
+            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+            with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
+                              visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+                nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
+                nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():

@@ -235,26 +256,29 @@ class App:
             btn_openfolder = gr.Button('📂', scale=1)
 
             params = [mic_input, dd_file_format]
-            whisper_params = WhisperGradioComponents(...)
+            whisper_params = WhisperParameters(model_size=dd_model,
+                                               lang=dd_lang,
+                                               is_translate=cb_translate,
+                                               beam_size=nb_beam_size,
+                                               log_prob_threshold=nb_log_prob_threshold,
+                                               no_speech_threshold=nb_no_speech_threshold,
+                                               compute_type=dd_compute_type,
+                                               best_of=nb_best_of,
+                                               patience=nb_patience,
+                                               condition_on_previous_text=cb_condition_on_previous_text,
+                                               initial_prompt=tb_initial_prompt,
+                                               temperature=sd_temperature,
+                                               compression_ratio_threshold=nb_compression_ratio_threshold,
+                                               vad_filter=cb_vad_filter,
+                                               threshold=sd_threshold,
+                                               min_speech_duration_ms=nb_min_speech_duration_ms,
+                                               max_speech_duration_s=nb_max_speech_duration_s,
+                                               min_silence_duration_ms=nb_min_silence_duration_ms,
+                                               window_size_sample=nb_window_size_sample,
+                                               speech_pad_ms=nb_speech_pad_ms,
+                                               chunk_length_s=nb_chunk_length_s,
+                                               batch_size=nb_batch_size)
+
             btn_run.click(fn=self.whisper_inf.transcribe_mic,
                           inputs=params + whisper_params.to_list(),
                           outputs=[tb_indicator, files_subtitles])

@@ -354,6 +378,7 @@ parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True,
 parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
 parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
 parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
+parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
 _args = parser.parse_args()
 
 if __name__ == "__main__":
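The `params + whisper_params.to_list()` wiring works because Gradio's `click` passes the current values of the `inputs` components to the handler as positional arguments; the transcribe methods therefore accept `*whisper_params` and rebuild a typed value object by index. A minimal self-contained sketch of the same pattern (the component and class names here are illustrative, not taken from the PR):

from dataclasses import dataclass, fields
import gradio as gr

@dataclass
class DemoComponents:
    # Gradio components, declared in the order the handler expects.
    beam_size: gr.Number
    temperature: gr.Slider

    def to_list(self) -> list:
        # Field declaration order defines the positional order seen by the handler.
        return [getattr(self, f.name) for f in fields(self)]

def handler(*values):
    # Gradio passes component values positionally, matching to_list() order.
    beam_size, temperature = values
    return f"beam_size={beam_size}, temperature={temperature}"

with gr.Blocks() as demo:
    comps = DemoComponents(
        beam_size=gr.Number(label="Beam Size", value=1, precision=0),
        temperature=gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.0),
    )
    btn = gr.Button("Run")
    out = gr.Textbox()
    btn.click(fn=handler, inputs=comps.to_list(), outputs=out)

# demo.launch()  # uncomment to run locally

Because the coupling is purely positional, adding a component anywhere but the end of the field list silently shifts every later argument, which is why this PR appends chunk_length_s and batch_size as the last two fields.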
modules/faster_whisper_inference.py
CHANGED
@@ -52,7 +52,7 @@ class FasterWhisperInference(WhisperBase):
         """
         start_time = time.time()
 
-        params = ...
+        params = WhisperParameters.post_process(*whisper_params)
 
         if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
             self.update_model(params.model_size, params.compute_type, progress)
modules/insanely_fast_whisper_inference.py
ADDED
@@ -0,0 +1,181 @@
import os
import time
import numpy as np
from typing import BinaryIO, Union, Tuple, List
import torch
from transformers import pipeline
from transformers.utils import is_flash_attn_2_available
import gradio as gr
from huggingface_hub import hf_hub_download
import whisper
from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn

from modules.whisper_parameter import *
from modules.whisper_base import WhisperBase


class InsanelyFastWhisperInference(WhisperBase):
    def __init__(self):
        super().__init__(
            model_dir=os.path.join("models", "Whisper", "insanely_fast_whisper")
        )
        openai_models = whisper.available_models()
        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
        self.available_models = openai_models + distil_models
        self.available_compute_types = ["float16"]

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress,
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        Transcribe method for insanely-fast-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Gradio components related to Whisper. See modules/whisper_parameter.py for details.

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParameters.post_process(*whisper_params)

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        if params.lang == "Automatic Detection":
            params.lang = None
        else:
            language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
            params.lang = language_code_dict[params.lang]

        progress(0, desc="Transcribing... Progress is not shown in insanely-fast-whisper.")
        with Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(style="yellow1", pulse_style="white"),
            TimeElapsedColumn(),
        ) as progress:
            progress.add_task("[yellow]Transcribing...", total=None)

            segments = self.model(
                inputs=audio,
                return_timestamps=True,
                chunk_length_s=params.chunk_length_s,
                batch_size=params.batch_size,
                generate_kwargs={
                    "language": params.lang,
                    "task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
                    "no_speech_threshold": params.no_speech_threshold,
                    "temperature": params.temperature,
                    "compression_ratio_threshold": params.compression_ratio_threshold
                }
            )

        segments_result = self.format_result(
            transcribed_result=segments,
        )
        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress,
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            See more info: https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        model_path = os.path.join(self.model_dir, model_size)
        if not os.path.isdir(model_path) or not os.listdir(model_path):
            self.download_model(
                model_size=model_size,
                download_root=model_path,
                progress=progress
            )

        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = pipeline(
            "automatic-speech-recognition",
            model=os.path.join(self.model_dir, model_size),
            torch_dtype=self.current_compute_type,
            device=self.device,
            model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
        )

    @staticmethod
    def format_result(
        transcribed_result: dict
    ) -> List[dict]:
        """
        Format the transcription result of insanely-fast-whisper to match the other implementations.

        Parameters
        ----------
        transcribed_result: dict
            Transcription result of insanely-fast-whisper

        Returns
        ----------
        result: List[dict]
            Result formatted to match the other implementations
        """
        result = transcribed_result["chunks"]
        for item in result:
            start, end = item["timestamp"][0], item["timestamp"][1]
            if end is None:
                end = start
            item["start"] = start
            item["end"] = end
        return result

    @staticmethod
    def download_model(
        model_size: str,
        download_root: str,
        progress: gr.Progress
    ):
        progress(0, 'Initializing model..')
        print(f'Downloading {model_size} to "{download_root}"....')

        os.makedirs(download_root, exist_ok=True)
        download_list = [
            "model.safetensors",
            "config.json",
            "generation_config.json",
            "preprocessor_config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "added_tokens.json",
            "special_tokens_map.json",
            "vocab.json",
        ]

        if model_size.startswith("distil"):
            repo_id = f"distil-whisper/{model_size}"
        else:
            repo_id = f"openai/whisper-{model_size}"
        for item in download_list:
            hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)
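For reference, with `return_timestamps=True` the Transformers ASR pipeline returns a dict whose `chunks` list holds `{"timestamp": (start, end), "text": ...}` entries, and the final chunk's end timestamp can be `None`; `format_result` flattens these into the `start`/`end` keys the subtitle writers expect. A standalone sketch of that normalization (sample values invented):

# Sample shaped like the transformers ASR pipeline output with return_timestamps=True.
transcribed_result = {
    "text": " Hello world. Goodbye.",
    "chunks": [
        {"timestamp": (0.0, 2.5), "text": " Hello world."},
        {"timestamp": (2.5, None), "text": " Goodbye."},  # final chunk may lack an end time
    ],
}

result = transcribed_result["chunks"]
for item in result:
    start, end = item["timestamp"]
    if end is None:      # fall back to the start time, as format_result does
        end = start
    item["start"] = start
    item["end"] = end

print(result[1]["start"], result[1]["end"])  # 2.5 2.5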
modules/whisper_Inference.py
CHANGED
@@ -41,7 +41,7 @@ class WhisperInference(WhisperBase):
             elapsed time for transcription
         """
         start_time = time.time()
-        params = ...
+        params = WhisperParameters.post_process(*whisper_params)
 
         if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
             self.update_model(params.model_size, params.compute_type, progress)
modules/whisper_parameter.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Optional
 
 
 @dataclass
-class WhisperGradioComponents:
+class WhisperParameters:
     model_size: gr.Dropdown
     lang: gr.Dropdown
    is_translate: gr.Checkbox

@@ -25,8 +25,12 @@ class WhisperGradioComponents:
     min_silence_duration_ms: gr.Number
     window_size_sample: gr.Number
     speech_pad_ms: gr.Number
+    chunk_length_s: gr.Number
+    batch_size: gr.Number
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
+    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
+    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
     See more about Gradio pre-processing: https://www.gradio.app/docs/components
 
     Attributes

@@ -111,11 +115,18 @@ class WhisperGradioComponents:
 
     speech_pad_ms: gr.Number
         This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
+
+    chunk_length_s: gr.Number
+        This parameter is related with the insanely-fast-whisper pipe.
+        Maximum length of each chunk
+
+    batch_size: gr.Number
+        This parameter is related with the insanely-fast-whisper pipe. Batch size to pass to the pipe
     """
 
     def to_list(self) -> list:
         """
-        Converts the data class attributes into a list
+        Converts the data class attributes into a list; use in the Gradio UI before Gradio pre-processing.
         See more about Gradio pre-processing: https://www.gradio.app/docs/components
 
         Returns

@@ -124,6 +135,42 @@ class WhisperGradioComponents:
         """
         return [getattr(self, f.name) for f in fields(self)]
 
+    @staticmethod
+    def post_process(*args) -> 'WhisperValues':
+        """
+        To use Whisper parameters in a function after Gradio post-processing.
+        See more about Gradio post-processing: https://www.gradio.app/docs/components
+
+        Returns
+        ----------
+        WhisperValues
+            Data class that has the values of the parameters
+        """
+        return WhisperValues(
+            model_size=args[0],
+            lang=args[1],
+            is_translate=args[2],
+            beam_size=args[3],
+            log_prob_threshold=args[4],
+            no_speech_threshold=args[5],
+            compute_type=args[6],
+            best_of=args[7],
+            patience=args[8],
+            condition_on_previous_text=args[9],
+            initial_prompt=args[10],
+            temperature=args[11],
+            compression_ratio_threshold=args[12],
+            vad_filter=args[13],
+            threshold=args[14],
+            min_speech_duration_ms=args[15],
+            max_speech_duration_s=args[16],
+            min_silence_duration_ms=args[17],
+            window_size_samples=args[18],
+            speech_pad_ms=args[19],
+            chunk_length_s=args[20],
+            batch_size=args[21]
+        )
+
 
 @dataclass
 class WhisperValues:

@@ -147,7 +194,8 @@ class WhisperValues:
     min_silence_duration_ms: int
     window_size_samples: int
     speech_pad_ms: int
+    chunk_length_s: int
+    batch_size: int
     """
-    A data class to use Whisper parameters.
-    See more about Gradio pre-processing: https://www.gradio.app/docs/components
+    A data class to use Whisper parameters.
     """
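Note that `post_process` ties `args` indices to the field declaration order shared by `WhisperParameters.to_list()` and `WhisperValues`, so all three must stay in sync whenever a parameter is added (as chunk_length_s and batch_size are here). A reduced sketch of the round trip, using three hypothetical fields instead of the full twenty-two:

from dataclasses import dataclass, fields

@dataclass
class Values:
    model_size: str
    lang: str
    beam_size: int

def post_process(*args) -> Values:
    # Index order must match the component order passed to Gradio's inputs=.
    return Values(model_size=args[0], lang=args[1], beam_size=args[2])

values = post_process("large-v2", "en", 5)
assert [f.name for f in fields(Values)] == ["model_size", "lang", "beam_size"]
print(values)  # Values(model_size='large-v2', lang='en', beam_size=5)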
user-start-webui.bat
CHANGED
@@ -12,6 +12,7 @@ set API_OPEN=
 set WHISPER_TYPE=
 set WHISPER_MODEL_DIR=
 set FASTER_WHISPER_MODEL_DIR=
+set INSANELY_FAST_WHISPER_MODEL_DIR=
 
 
 if not "%SERVER_NAME%"=="" (

@@ -47,7 +48,10 @@ if not "%WHISPER_MODEL_DIR%"=="" (
 if not "%FASTER_WHISPER_MODEL_DIR%"=="" (
     set FASTER_WHISPER_MODEL_DIR_ARG=--faster_whisper_model_dir "%FASTER_WHISPER_MODEL_DIR%"
 )
+if not "%INSANELY_FAST_WHISPER_MODEL_DIR%"=="" (
+    set INSANELY_FAST_WHISPER_MODEL_DIR_ARG=--insanely_fast_whisper_model_dir "%INSANELY_FAST_WHISPER_MODEL_DIR%"
+)
 
 :: Call the original .bat script with optional arguments
-start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %API_OPEN% %WHISPER_TYPE_ARG% %WHISPER_MODEL_DIR_ARG% %FASTER_WHISPER_MODEL_DIR_ARG%
+start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %API_OPEN% %WHISPER_TYPE_ARG% %WHISPER_MODEL_DIR_ARG% %FASTER_WHISPER_MODEL_DIR_ARG% %INSANELY_FAST_WHISPER_MODEL_DIR_ARG%
 pause
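Usage note: if INSANELY_FAST_WHISPER_MODEL_DIR is left empty, the guard above skips the argument entirely and existing setups behave as before; if it is set, the launcher forwards the value to start-webui.bat as --insanely_fast_whisper_model_dir, matching the argparse option added in app.py.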