Merge pull request #175 from jhj0517/feature/integrate-insanely_fast_whisper
Files changed:
- app.py (+111 -86)
- modules/faster_whisper_inference.py (+1 -1)
- modules/insanely_fast_whisper_inference.py (+181 -0)
- modules/whisper_Inference.py (+1 -1)
- modules/whisper_parameter.py (+52 -4)
- user-start-webui.bat (+5 -1)
app.py
CHANGED
@@ -4,6 +4,7 @@ import argparse
 
 from modules.whisper_Inference import WhisperInference
 from modules.faster_whisper_inference import FasterWhisperInference
+from modules.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.nllb_inference import NLLBInference
 from ui.htmls import *
 from modules.youtube_manager import get_ytmetas

@@ -24,12 +25,16 @@ class App:
     def init_whisper(self):
         whisper_type = self.args.whisper_type.lower().strip()
 
-        if whisper_type in ["faster_whisper", "faster-whisper"]:
+        if whisper_type in ["faster_whisper", "faster-whisper", "fasterwhisper"]:
             whisper_inf = FasterWhisperInference()
             whisper_inf.model_dir = self.args.faster_whisper_model_dir
-        ...
+        elif whisper_type in ["whisper"]:
             whisper_inf = WhisperInference()
             whisper_inf.model_dir = self.args.whisper_model_dir
+        elif whisper_type in ["insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
+                              "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"]:
+            whisper_inf = InsanelyFastWhisperInference()
+            whisper_inf.model_dir = self.args.insanely_fast_whisper_model_dir
         else:
             whisper_inf = FasterWhisperInference()
             whisper_inf.model_dir = self.args.faster_whisper_model_dir
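With this dispatch in place, the backend is selected at launch time: any of the aliases accepted above can be passed to the existing --whisper_type argument, and the model directory for the new backend comes from the --insanely_fast_whisper_model_dir argument added at the bottom of this file.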
@@ -69,14 +74,6 @@ class App:
             cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
             with gr.Row():
                 cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
-            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
-                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
-                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
-                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
-                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
-                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
-                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
-                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
             with gr.Accordion("Advanced_Parameters", open=False):
                 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)

@@ -88,6 +85,17 @@ class App:
                 tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                 sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
                 nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
+            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+            with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+                nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
+                nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():

@@ -96,26 +104,28 @@ class App:
             btn_openfolder = gr.Button('📂', scale=1)
 
             params = [input_file, dd_file_format, cb_timestamp]
-            whisper_params = WhisperGradioComponents(...)
+            whisper_params = WhisperParameters(model_size=dd_model,
+                                               lang=dd_lang,
+                                               is_translate=cb_translate,
+                                               beam_size=nb_beam_size,
+                                               log_prob_threshold=nb_log_prob_threshold,
+                                               no_speech_threshold=nb_no_speech_threshold,
+                                               compute_type=dd_compute_type,
+                                               best_of=nb_best_of,
+                                               patience=nb_patience,
+                                               condition_on_previous_text=cb_condition_on_previous_text,
+                                               initial_prompt=tb_initial_prompt,
+                                               temperature=sd_temperature,
+                                               compression_ratio_threshold=nb_compression_ratio_threshold,
+                                               vad_filter=cb_vad_filter,
+                                               threshold=sd_threshold,
+                                               min_speech_duration_ms=nb_min_speech_duration_ms,
+                                               max_speech_duration_s=nb_max_speech_duration_s,
+                                               min_silence_duration_ms=nb_min_silence_duration_ms,
+                                               window_size_sample=nb_window_size_sample,
+                                               speech_pad_ms=nb_speech_pad_ms,
+                                               chunk_length_s=nb_chunk_length_s,
+                                               batch_size=nb_batch_size)
 
             btn_run.click(fn=self.whisper_inf.transcribe_file,
                           inputs=params + whisper_params.to_list(),

@@ -143,14 +153,6 @@ class App:
             with gr.Row():
                 cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                            interactive=True)
-            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
-                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
-                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
-                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
-                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
-                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
-                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
-                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
             with gr.Accordion("Advanced_Parameters", open=False):
                 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)

@@ -162,6 +164,18 @@ class App:
                 tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                 sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
                 nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
+            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+            with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
+                              visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+                nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
+                nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():

@@ -170,26 +184,29 @@ class App:
             btn_openfolder = gr.Button('📂', scale=1)
 
             params = [tb_youtubelink, dd_file_format, cb_timestamp]
-            whisper_params = WhisperGradioComponents(...)
+            whisper_params = WhisperParameters(model_size=dd_model,
+                                               lang=dd_lang,
+                                               is_translate=cb_translate,
+                                               beam_size=nb_beam_size,
+                                               log_prob_threshold=nb_log_prob_threshold,
+                                               no_speech_threshold=nb_no_speech_threshold,
+                                               compute_type=dd_compute_type,
+                                               best_of=nb_best_of,
+                                               patience=nb_patience,
+                                               condition_on_previous_text=cb_condition_on_previous_text,
+                                               initial_prompt=tb_initial_prompt,
+                                               temperature=sd_temperature,
+                                               compression_ratio_threshold=nb_compression_ratio_threshold,
+                                               vad_filter=cb_vad_filter,
+                                               threshold=sd_threshold,
+                                               min_speech_duration_ms=nb_min_speech_duration_ms,
+                                               max_speech_duration_s=nb_max_speech_duration_s,
+                                               min_silence_duration_ms=nb_min_silence_duration_ms,
+                                               window_size_sample=nb_window_size_sample,
+                                               speech_pad_ms=nb_speech_pad_ms,
+                                               chunk_length_s=nb_chunk_length_s,
+                                               batch_size=nb_batch_size)
+
             btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                           inputs=params + whisper_params.to_list(),
                           outputs=[tb_indicator, files_subtitles])

@@ -209,14 +226,6 @@ class App:
             dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
             with gr.Row():
                 cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
-            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
-                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
-                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
-                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
-                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
-                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
-                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
-                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
             with gr.Accordion("Advanced_Parameters", open=False):
                 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)

@@ -227,6 +236,18 @@ class App:
                 cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
                 tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                 sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
+            with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+            with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
+                              visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+                nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
+                nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():

@@ -235,26 +256,29 @@ class App:
             btn_openfolder = gr.Button('📂', scale=1)
 
             params = [mic_input, dd_file_format]
-            whisper_params = WhisperGradioComponents(...)
+            whisper_params = WhisperParameters(model_size=dd_model,
+                                               lang=dd_lang,
+                                               is_translate=cb_translate,
+                                               beam_size=nb_beam_size,
+                                               log_prob_threshold=nb_log_prob_threshold,
+                                               no_speech_threshold=nb_no_speech_threshold,
+                                               compute_type=dd_compute_type,
+                                               best_of=nb_best_of,
+                                               patience=nb_patience,
+                                               condition_on_previous_text=cb_condition_on_previous_text,
+                                               initial_prompt=tb_initial_prompt,
+                                               temperature=sd_temperature,
+                                               compression_ratio_threshold=nb_compression_ratio_threshold,
+                                               vad_filter=cb_vad_filter,
+                                               threshold=sd_threshold,
+                                               min_speech_duration_ms=nb_min_speech_duration_ms,
+                                               max_speech_duration_s=nb_max_speech_duration_s,
+                                               min_silence_duration_ms=nb_min_silence_duration_ms,
+                                               window_size_sample=nb_window_size_sample,
+                                               speech_pad_ms=nb_speech_pad_ms,
+                                               chunk_length_s=nb_chunk_length_s,
+                                               batch_size=nb_batch_size)
+
             btn_run.click(fn=self.whisper_inf.transcribe_mic,
                           inputs=params + whisper_params.to_list(),
                           outputs=[tb_indicator, files_subtitles])

@@ -354,6 +378,7 @@ parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True,
 parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
 parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
 parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
+parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
 _args = parser.parse_args()
 
 if __name__ == "__main__":
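The `params + whisper_params.to_list()` wiring works because Gradio's `click` passes the current values of the `inputs` components to the handler as positional arguments; the transcribe methods therefore accept `*whisper_params` and rebuild a typed value object by index. A minimal self-contained sketch of the same pattern (the component and class names here are illustrative, not taken from the PR):

from dataclasses import dataclass, fields
import gradio as gr

@dataclass
class DemoComponents:
    # Gradio components, declared in the order the handler expects.
    beam_size: gr.Number
    temperature: gr.Slider

    def to_list(self) -> list:
        # Field declaration order defines the positional order seen by the handler.
        return [getattr(self, f.name) for f in fields(self)]

def handler(*values):
    # Gradio passes component values positionally, matching to_list() order.
    beam_size, temperature = values
    return f"beam_size={beam_size}, temperature={temperature}"

with gr.Blocks() as demo:
    comps = DemoComponents(
        beam_size=gr.Number(label="Beam Size", value=1, precision=0),
        temperature=gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.0),
    )
    btn = gr.Button("Run")
    out = gr.Textbox()
    btn.click(fn=handler, inputs=comps.to_list(), outputs=out)

# demo.launch()  # uncomment to run locally

Because the coupling is purely positional, adding a component anywhere but the end of the field list silently shifts every later argument, which is why this PR appends chunk_length_s and batch_size as the last two fields.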
modules/faster_whisper_inference.py
CHANGED
@@ -52,7 +52,7 @@ class FasterWhisperInference(WhisperBase):
         """
         start_time = time.time()
 
-        params = ...
+        params = WhisperParameters.post_process(*whisper_params)
 
         if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
             self.update_model(params.model_size, params.compute_type, progress)
modules/insanely_fast_whisper_inference.py
ADDED
@@ -0,0 +1,181 @@
import os
import time
import numpy as np
from typing import BinaryIO, Union, Tuple, List
import torch
from transformers import pipeline
from transformers.utils import is_flash_attn_2_available
import gradio as gr
from huggingface_hub import hf_hub_download
import whisper
from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn

from modules.whisper_parameter import *
from modules.whisper_base import WhisperBase


class InsanelyFastWhisperInference(WhisperBase):
    def __init__(self):
        super().__init__(
            model_dir=os.path.join("models", "Whisper", "insanely_fast_whisper")
        )
        openai_models = whisper.available_models()
        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
        self.available_models = openai_models + distil_models
        self.available_compute_types = ["float16"]

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress,
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        Transcribe method for insanely-fast-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Gradio components related to Whisper. See modules/whisper_parameter.py for details.

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParameters.post_process(*whisper_params)

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        if params.lang == "Automatic Detection":
            params.lang = None
        else:
            language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
            params.lang = language_code_dict[params.lang]

        progress(0, desc="Transcribing... Progress is not shown in insanely-fast-whisper.")
        with Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(style="yellow1", pulse_style="white"),
            TimeElapsedColumn(),
        ) as progress:
            progress.add_task("[yellow]Transcribing...", total=None)

            segments = self.model(
                inputs=audio,
                return_timestamps=True,
                chunk_length_s=params.chunk_length_s,
                batch_size=params.batch_size,
                generate_kwargs={
                    "language": params.lang,
                    "task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
                    "no_speech_threshold": params.no_speech_threshold,
                    "temperature": params.temperature,
                    "compression_ratio_threshold": params.compression_ratio_threshold
                }
            )

        segments_result = self.format_result(
            transcribed_result=segments,
        )
        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress,
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            See more info: https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        model_path = os.path.join(self.model_dir, model_size)
        if not os.path.isdir(model_path) or not os.listdir(model_path):
            self.download_model(
                model_size=model_size,
                download_root=model_path,
                progress=progress
            )

        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = pipeline(
            "automatic-speech-recognition",
            model=os.path.join(self.model_dir, model_size),
            torch_dtype=self.current_compute_type,
            device=self.device,
            model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
        )

    @staticmethod
    def format_result(
        transcribed_result: dict
    ) -> List[dict]:
        """
        Format the transcription result of insanely-fast-whisper to match the other implementations.

        Parameters
        ----------
        transcribed_result: dict
            Transcription result of insanely-fast-whisper

        Returns
        ----------
        result: List[dict]
            Result formatted to match the other implementations
        """
        result = transcribed_result["chunks"]
        for item in result:
            start, end = item["timestamp"][0], item["timestamp"][1]
            if end is None:
                end = start
            item["start"] = start
            item["end"] = end
        return result

    @staticmethod
    def download_model(
        model_size: str,
        download_root: str,
        progress: gr.Progress
    ):
        progress(0, 'Initializing model..')
        print(f'Downloading {model_size} to "{download_root}"....')

        os.makedirs(download_root, exist_ok=True)
        download_list = [
            "model.safetensors",
            "config.json",
            "generation_config.json",
            "preprocessor_config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "added_tokens.json",
            "special_tokens_map.json",
            "vocab.json",
        ]

        if model_size.startswith("distil"):
            repo_id = f"distil-whisper/{model_size}"
        else:
            repo_id = f"openai/whisper-{model_size}"
        for item in download_list:
            hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)
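For reference, with `return_timestamps=True` the Transformers ASR pipeline returns a dict whose `chunks` list holds `{"timestamp": (start, end), "text": ...}` entries, and the final chunk's end timestamp can be `None`; `format_result` flattens these into the `start`/`end` keys the subtitle writers expect. A standalone sketch of that normalization (sample values invented):

# Sample shaped like the transformers ASR pipeline output with return_timestamps=True.
transcribed_result = {
    "text": " Hello world. Goodbye.",
    "chunks": [
        {"timestamp": (0.0, 2.5), "text": " Hello world."},
        {"timestamp": (2.5, None), "text": " Goodbye."},  # final chunk may lack an end time
    ],
}

result = transcribed_result["chunks"]
for item in result:
    start, end = item["timestamp"]
    if end is None:      # fall back to the start time, as format_result does
        end = start
    item["start"] = start
    item["end"] = end

print(result[1]["start"], result[1]["end"])  # 2.5 2.5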
modules/whisper_Inference.py
CHANGED
@@ -41,7 +41,7 @@ class WhisperInference(WhisperBase):
             elapsed time for transcription
         """
         start_time = time.time()
-        params = ...
+        params = WhisperParameters.post_process(*whisper_params)
 
         if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
             self.update_model(params.model_size, params.compute_type, progress)
modules/whisper_parameter.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Optional
 
 
 @dataclass
-class WhisperGradioComponents:
+class WhisperParameters:
     model_size: gr.Dropdown
     lang: gr.Dropdown
    is_translate: gr.Checkbox

@@ -25,8 +25,12 @@ class WhisperGradioComponents:
     min_silence_duration_ms: gr.Number
     window_size_sample: gr.Number
     speech_pad_ms: gr.Number
+    chunk_length_s: gr.Number
+    batch_size: gr.Number
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
+    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
+    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
     See more about Gradio pre-processing: https://www.gradio.app/docs/components
 
     Attributes

@@ -111,11 +115,18 @@ class WhisperGradioComponents:
 
     speech_pad_ms: gr.Number
         This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
+
+    chunk_length_s: gr.Number
+        This parameter is related with the insanely-fast-whisper pipe.
+        Maximum length of each chunk
+
+    batch_size: gr.Number
+        This parameter is related with the insanely-fast-whisper pipe. Batch size to pass to the pipe
     """
 
     def to_list(self) -> list:
         """
-        Converts the data class attributes into a list
+        Converts the data class attributes into a list; use in the Gradio UI before Gradio pre-processing.
         See more about Gradio pre-processing: https://www.gradio.app/docs/components
 
         Returns

@@ -124,6 +135,42 @@ class WhisperGradioComponents:
         """
         return [getattr(self, f.name) for f in fields(self)]
 
+    @staticmethod
+    def post_process(*args) -> 'WhisperValues':
+        """
+        To use Whisper parameters in a function after Gradio post-processing.
+        See more about Gradio post-processing: https://www.gradio.app/docs/components
+
+        Returns
+        ----------
+        WhisperValues
+            Data class that has the values of the parameters
+        """
+        return WhisperValues(
+            model_size=args[0],
+            lang=args[1],
+            is_translate=args[2],
+            beam_size=args[3],
+            log_prob_threshold=args[4],
+            no_speech_threshold=args[5],
+            compute_type=args[6],
+            best_of=args[7],
+            patience=args[8],
+            condition_on_previous_text=args[9],
+            initial_prompt=args[10],
+            temperature=args[11],
+            compression_ratio_threshold=args[12],
+            vad_filter=args[13],
+            threshold=args[14],
+            min_speech_duration_ms=args[15],
+            max_speech_duration_s=args[16],
+            min_silence_duration_ms=args[17],
+            window_size_samples=args[18],
+            speech_pad_ms=args[19],
+            chunk_length_s=args[20],
+            batch_size=args[21]
+        )
+
 
 @dataclass
 class WhisperValues:

@@ -147,7 +194,8 @@ class WhisperValues:
     min_silence_duration_ms: int
     window_size_samples: int
     speech_pad_ms: int
+    chunk_length_s: int
+    batch_size: int
     """
-    A data class to use Whisper parameters.
-    See more about Gradio pre-processing: https://www.gradio.app/docs/components
+    A data class to use Whisper parameters.
     """
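Note that `post_process` ties `args` indices to the field declaration order shared by `WhisperParameters.to_list()` and `WhisperValues`, so all three must stay in sync whenever a parameter is added (as chunk_length_s and batch_size are here). A reduced sketch of the round trip, using three hypothetical fields instead of the full twenty-two:

from dataclasses import dataclass, fields

@dataclass
class Values:
    model_size: str
    lang: str
    beam_size: int

def post_process(*args) -> Values:
    # Index order must match the component order passed to Gradio's inputs=.
    return Values(model_size=args[0], lang=args[1], beam_size=args[2])

values = post_process("large-v2", "en", 5)
assert [f.name for f in fields(Values)] == ["model_size", "lang", "beam_size"]
print(values)  # Values(model_size='large-v2', lang='en', beam_size=5)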
user-start-webui.bat
CHANGED
@@ -12,6 +12,7 @@ set API_OPEN=
 set WHISPER_TYPE=
 set WHISPER_MODEL_DIR=
 set FASTER_WHISPER_MODEL_DIR=
+set INSANELY_FAST_WHISPER_MODEL_DIR=
 
 
 if not "%SERVER_NAME%"=="" (

@@ -47,7 +48,10 @@ if not "%WHISPER_MODEL_DIR%"=="" (
 if not "%FASTER_WHISPER_MODEL_DIR%"=="" (
     set FASTER_WHISPER_MODEL_DIR_ARG=--faster_whisper_model_dir "%FASTER_WHISPER_MODEL_DIR%"
 )
+if not "%INSANELY_FAST_WHISPER_MODEL_DIR%"=="" (
+    set INSANELY_FAST_WHISPER_MODEL_DIR_ARG=--insanely_fast_whisper_model_dir "%INSANELY_FAST_WHISPER_MODEL_DIR%"
+)
 
 :: Call the original .bat script with optional arguments
-start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %API_OPEN% %WHISPER_TYPE_ARG% %WHISPER_MODEL_DIR_ARG% %FASTER_WHISPER_MODEL_DIR_ARG%
+start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %API_OPEN% %WHISPER_TYPE_ARG% %WHISPER_MODEL_DIR_ARG% %FASTER_WHISPER_MODEL_DIR_ARG% %INSANELY_FAST_WHISPER_MODEL_DIR_ARG%
 pause
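Usage note: if INSANELY_FAST_WHISPER_MODEL_DIR is left empty, the guard above skips the argument entirely and existing setups behave as before; if it is set, the launcher forwards the value to start-webui.bat as --insanely_fast_whisper_model_dir, matching the argparse option added in app.py.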