Noumida committed on
Commit 418029a · verified · 1 Parent(s): 6d9d341

Update app.py

Files changed (1)
  1. app.py +257 -158
app.py CHANGED
@@ -3,13 +3,13 @@
 
 """
 Multilingual Voice-Based Agricultural Recommendation System
 Optimized for Hugging Face Spaces deployment with Whisper-first pipeline
-Now using IndicTransToolkit for proper IndicTrans2 implementation
 """
 
 from __future__ import annotations
 import torch
-import torchaudio
 import json
 import os
 import re
@@ -21,8 +21,7 @@ from typing import List, Dict, Optional, Union
 from transformers import AutoModel, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import AutoTokenizer, PretrainedConfig, PreTrainedModel
-from transformers import AutoModelForSeq2SeqLM  # Added for IndicTrans2
-from IndicTransToolkit.processor import IndicProcessor  # Official IndicTrans2 processor
 from pathlib import Path
 import torch.nn as nn
 from transformers import Gemma3ForCausalLM, Gemma3TextConfig
@@ -34,22 +33,75 @@ from transformers.models.gemma3.modeling_gemma3 import (
 from transformers.modeling_outputs import TokenClassifierOutput
 from transformers.utils import logging
 from sentence_transformers import SentenceTransformer, util
 
 logger = logging.get_logger(__name__)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # --- CONFIGURATION ---
-# Hugging Face Token (optional, can be set as environment variable)
-# HF_TOKEN = os.getenv("HF_TOKEN", "")
-
-# # Login to Hugging Face if token is provided
-# if HF_TOKEN:
-#     from huggingface_hub import login
-#     try:
-#         login(HF_TOKEN)
-#         print("✅ Successfully logged in to Hugging Face!")
-#     except Exception as e:
-#         print(f"⚠️ HF login failed: {e}")
 
 # --- CUSTOM GEMMA3 BIDIRECTIONAL MODEL FOR PUNCTUATION ---
 class Gemma3PunctuationConfig(Gemma3TextConfig):
@@ -273,100 +325,130 @@ SUPPORTED_AUDIO_FORMATS = {
 def detect_audio_format(audio_path: str) -> str:
     return Path(audio_path).suffix.lower()
 
-def get_optimal_backend(audio_format: str) -> str:
-    ffmpeg_formats = {'.mp3', '.opus', '.m4a', '.aac', '.mp4', '.webm', '.3gp', '.mpeg'}
     try:
-        backends = torchaudio.list_audio_backends()
-        if 'ffmpeg' in backends and audio_format in ffmpeg_formats:
-            return 'ffmpeg'
-        elif 'sox_io' in backends:
-            return 'sox_io'
-        elif 'soundfile' in backends:
-            return 'soundfile'
-        else:
-            return None
-    except:
-        return None
-
-def convert_to_mono(waveform: torch.Tensor) -> torch.Tensor:
-    if waveform.shape[0] > 1:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
-        print(f"🔄 Converted from stereo to mono")
-    else:
-        print("📻 Audio is already mono")
-    return waveform
 
-def preprocess_audio(audio_path: str, target_sr: int = 16000) -> tuple:
     try:
-        original_audio_format = detect_audio_format(audio_path)
-        print(f"🎵 Detected original format: {original_audio_format}")
-
-        current_audio_path = audio_path
-        temp_mp3_path = None
-
-        if original_audio_format in {'.mpeg', '.opus'}:
-            print(f"🔄 Converting {original_audio_format} to .mp3...")
-            temp_mp3_fd, temp_mp3_path = tempfile.mkstemp(suffix=".mp3", prefix="audio_converted_")
-            os.close(temp_mp3_fd)
 
             try:
-                backend = get_optimal_backend(original_audio_format)
                 if backend:
                     waveform, orig_sr = torchaudio.load(audio_path, backend=backend)
                 else:
                     waveform, orig_sr = torchaudio.load(audio_path)
-
-                torchaudio.save(temp_mp3_path, waveform, orig_sr, format="mp3")
-                current_audio_path = temp_mp3_path
-                print(f"✅ Converted to .mp3: {current_audio_path}")
-
-            except Exception as conversion_error:
-                print(f"❌ Error during conversion to .mp3: {conversion_error}")
-                print("⚠️ Could not convert to .mp3, attempting to process original file.")
-                current_audio_path = audio_path
-                if temp_mp3_path and os.path.exists(temp_mp3_path):
-                    os.unlink(temp_mp3_path)
-                    temp_mp3_path = None
-
-        audio_format_to_process = detect_audio_format(current_audio_path)
-        print(f"🎵 Processing format: {audio_format_to_process}")
-
-        if audio_format_to_process not in SUPPORTED_AUDIO_FORMATS:
-            print(f"⚠️ Warning: {audio_format_to_process} may not be fully supported")
-
-        backend = get_optimal_backend(audio_format_to_process)
-        if backend:
-            print(f"🔧 Using {backend} backend for {audio_format_to_process}")
-
-        try:
-            if backend:
-                waveform, orig_sr = torchaudio.load(current_audio_path, backend=backend)
-            else:
-                waveform, orig_sr = torchaudio.load(current_audio_path)
-        except Exception as load_error:
-            print(f"⚠️ Primary load method failed: {str(load_error)}")
-            print("🔄 Trying alternative loading method...")
-
-            for fallback_backend in ['ffmpeg', 'sox_io', 'soundfile']:
-                try:
-                    backends = torchaudio.list_audio_backends()
-                    if fallback_backend in backends:
-                        print(f"🔄 Trying {fallback_backend} backend...")
-                        waveform, orig_sr = torchaudio.load(current_audio_path, backend=fallback_backend)
-                        print(f"✅ Successfully loaded with {fallback_backend} backend")
-                        break
-                except Exception as e:
-                    continue
-            else:
-                try:
-                    waveform, orig_sr = torchaudio.load(current_audio_path)
-                    print("✅ Loaded with default backend")
-                except Exception as final_error:
-                    raise Exception(f"Failed to load audio file with any backend: {final_error}")
-
         print(f"🎵 Loaded audio: {waveform.shape} at {orig_sr} Hz")
-        waveform = convert_to_mono(waveform)
-
         if orig_sr != target_sr:
             print(f"🔄 Resampling from {orig_sr} Hz to {target_sr} Hz...")
             waveform = torchaudio.functional.resample(
@@ -375,24 +457,49 @@ def preprocess_audio(audio_path: str, target_sr: int = 16000) -> tuple:
                 new_freq=target_sr
             )
             print(f"✅ Resampled to {target_sr} Hz")
-        else:
-            print(f"✅ Audio already at target {target_sr} Hz")
-
-        print(f"✅ Final preprocessed audio: {waveform.shape} at {target_sr} Hz")
         return waveform, target_sr
 
     except Exception as e:
         error_msg = f"❌ Error in audio preprocessing: {str(e)}"
         print(error_msg)
         raise Exception(error_msg)
 
-    finally:
-        if temp_mp3_path and os.path.exists(temp_mp3_path):
-            if temp_mp3_path and os.path.exists(temp_mp3_path) and "temp" in temp_mp3_path:
-                os.unlink(temp_mp3_path)
-
 # --- GLOBAL MODEL STORAGE ---
-# Global variables to store loaded models (acts as cache)
 models = {}
 qa_system = {}
 
@@ -400,7 +507,7 @@ def load_models():
     """Load all models with caching using global variables."""
     global models
 
-    if models: # Already loaded
         print("✅ Models already loaded from cache")
         return models
 
@@ -461,12 +568,11 @@ def load_models():
         models['punctuation_model'] = None
         models['punctuation_id2label'] = None
 
-    # Load IndicTrans2 model using the proper IndicTransToolkit
     try:
         print("🔄 Loading IndicTrans2 for translation...")
         model_name = "ai4bharat/indictrans2-indic-en-1B"
 
-        # Load tokenizer and model
         models['indictrans_tokenizer'] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         models['indictrans_model'] = AutoModelForSeq2SeqLM.from_pretrained(
             model_name,
@@ -474,13 +580,16 @@ def load_models():
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
         ).to(device)
 
-        # Initialize IndicProcessor
-        models['indic_processor'] = IndicProcessor(inference=True)
-
-        print("✅ IndicTrans2 loaded successfully - Ready for translation!")
     except Exception as e:
         print(f"❌ Error loading IndicTrans2 model: {e}")
-        print("⚠️ Translation functionality will be limited")
         models['indictrans_tokenizer'] = None
         models['indictrans_model'] = None
         models['indic_processor'] = None
@@ -491,14 +600,13 @@ def load_qa_system():
     """Load Q&A system with caching using global variables."""
     global qa_system
 
-    if qa_system: # Already loaded
         print("✅ Q&A system already loaded from cache")
         return qa_system
 
     print("🚀 Loading Q&A system for the first time...")
 
     try:
-        # Load your Q&A dataset
         if os.path.exists("cleaned_qa_dataset.xlsx"):
             df = pd.read_excel("cleaned_qa_dataset.xlsx")
             qa_pairs = df[['Question', 'Answer']].dropna().drop_duplicates().reset_index(drop=True)
@@ -621,7 +729,7 @@ def detect_language_with_whisper(audio_path):
 
 def translate_with_indictrans2(text: str, source_lang: str = "hin_Deva") -> Dict:
     """
-    Translate Indic language text to English using IndicTransToolkit and IndicTrans2 model.
     """
     try:
         if not models.get('indictrans_model') or not models.get('indictrans_tokenizer') or not models.get('indic_processor'):
@@ -631,9 +739,8 @@ def translate_with_indictrans2(text: str, source_lang: str = "hin_Deva") -> Dict
                 "translated_text": ""
             }
 
-        print(f"🔄 Translating with IndicTransToolkit: {source_lang} -> eng_Latn")
 
-        # Prepare input sentences
         input_sentences = [text.strip()]
 
         # Preprocess with IndicProcessor
@@ -670,7 +777,7 @@ def translate_with_indictrans2(text: str, source_lang: str = "hin_Deva") -> Dict
             clean_up_tokenization_spaces=True,
         )
 
-        # Postprocess the translations using IndicProcessor
         translations = models['indic_processor'].postprocess_batch(generated_tokens, lang="eng_Latn")
 
         translated_text = translations[0] if translations else ""
@@ -699,13 +806,8 @@ def semantic_qa_search(user_question, similarity_threshold=0.3, top_k=3):
         }
 
     try:
-        # Encode the user question
         user_question_embedding = qa_system['model'].encode(user_question, convert_to_tensor=True)
-
-        # Compute cosine similarity
         similarities = util.cos_sim(user_question_embedding, qa_system['question_embeddings'])
-
-        # Get top k most similar questions
         top_results = torch.topk(similarities, k=top_k)
 
         results = []
@@ -770,12 +872,10 @@ def transcribe_audio_with_lid(audio_path):
         print("🇺🇸 Processing as English audio...")
         detected_lang_str = "Detected Language: English (Whisper Detection)"
 
-        # Add punctuation to Whisper transcription
         punctuated_transcription = add_punctuation(whisper_transcription)
         print(f"Original Whisper: {whisper_transcription}")
         print(f"With punctuation: {punctuated_transcription}")
 
-        # For English, translation is the same as transcription
         translation_result = punctuated_transcription
 
         return (
@@ -793,7 +893,6 @@ def transcribe_audio_with_lid(audio_path):
 
     print("🔍 Using MMS-LID for detailed language identification...")
 
-    # Language detection using MMS-LID for non-English
     inputs = models['lid_processor'](waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = models['lid_model'](**inputs)
@@ -814,25 +913,23 @@ def transcribe_audio_with_lid(audio_path):
     if not models.get('asr_model'):
         return "ASR model not available.", "", ""
 
-    # Use IndicConformer for non-English transcription
     print(f"🔤 Transcribing with IndicConformer ({detected_lang_name})...")
     with torch.no_grad():
         transcription = models['asr_model'](waveform_16k.to(device), asr_lang_code, "rnnt")
     print("✅ IndicConformer transcription complete.")
 
-    # Add punctuation to transcription
     punctuated_transcription = add_punctuation(transcription.strip()) if transcription else ""
     print(f"Original: {transcription}")
     print(f"With punctuation: {punctuated_transcription}")
 
-    # Translation to English using IndicTrans2 with IndicTransToolkit
     translation_result = ""
     translation_error = ""
 
     if punctuated_transcription:
         indictrans_lang_code = ASR_TO_INDICTRANS_MAP.get(asr_lang_code)
         if indictrans_lang_code:
-            print(f"🔄 Translating {detected_lang_name} to English with IndicTransToolkit...")
             translation_response = translate_with_indictrans2(
                 punctuated_transcription,
                 indictrans_lang_code
@@ -840,7 +937,7 @@ def transcribe_audio_with_lid(audio_path):
 
             if translation_response["success"]:
                 translation_result = translation_response["translated_text"]
-                print("✅ IndicTransToolkit translation complete.")
             else:
                 translation_error = translation_response["error"]
                 translation_result = "Translation failed"
@@ -851,7 +948,6 @@ def transcribe_audio_with_lid(audio_path):
 
     else:
         translation_result = "No text to translate"
 
-    # Combine results
     if translation_error:
         translation_display = f"❌ {translation_result}\nError: {translation_error}"
@@ -870,10 +966,8 @@ def process_audio_and_search(audio_path):
     """Process audio and perform semantic search."""
     print(f"--- Processing audio file with Whisper-first pipeline: {audio_path} ---")
 
-    # Process audio
     detected_language, transcription, translated_text = transcribe_audio_with_lid(audio_path)
 
-    # Check for errors
     if "Error" in detected_language:
         return {
             "status": "audio_processing_failed",
@@ -899,27 +993,21 @@ def gradio_interface_fn(audio_path):
     if not audio_path:
         return "No audio file provided", "", "", "Please upload an audio file."
 
-    # Call the integrated workflow function
     integrated_result = process_audio_and_search(audio_path)
 
-    # Initialize output variables
     detected_language_output = ""
     transcription_output = ""
     translated_text_output = ""
     semantic_search_output_string = ""
 
-    # Check the status of the result
     if integrated_result["status"] == "success":
-        # Extract audio processing results
         audio_processing = integrated_result["audio_processing"]
         detected_language_output = audio_processing["detected_language"]
         transcription_output = audio_processing["transcription"]
         translated_text_output = audio_processing["translated_text"]
 
-        # Extract semantic search results
         semantic_search = integrated_result["semantic_search"]
 
-        # Format semantic search output to show top 3 results
         if semantic_search["status"] == "success":
             semantic_search_output_string = "--- Top 3 Semantic Search Results ---\n\n"
             for result in semantic_search["results"]:
@@ -938,45 +1026,49 @@ def gradio_interface_fn(audio_path):
             )
 
     else:
-        # Handle audio processing failure
         error_message = integrated_result.get("error", "An unknown error occurred during audio processing.")
         detected_language_output = f"Error: {error_message}"
         transcription_output = "N/A"
         translated_text_output = "N/A"
         semantic_search_output_string = "Semantic search could not be performed due to audio processing error."
 
-    # Return the formatted outputs as a tuple
     return (detected_language_output, transcription_output, translated_text_output, semantic_search_output_string)
 
 def create_gradio_app():
     """Create the Gradio interface."""
 
-    # Define input component for audio file
     audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-
-    # Define output components for audio processing results
     detected_language_output = gr.Textbox(label="Detected Language")
     transcription_output = gr.Textbox(label="Transcription")
     translated_text_output = gr.Textbox(label="Translated Text")
     semantic_search_output = gr.Textbox(label="Semantic Search Results")
 
-    # Create the interface
     iface = gr.Interface(
         fn=gradio_interface_fn,
         inputs=audio_input,
         outputs=[detected_language_output, transcription_output, translated_text_output, semantic_search_output],
         title="🌾 Multilingual Agricultural Voice Assistant",
-        description="""
         Upload an audio file in English or any of the 22+ supported Indic languages.
         The system will:
         1. 🎧 Detect the language automatically
         2. 📝 Transcribe the speech with punctuation
-        3. 🌍 Translate to English using **IndicTransToolkit + IndicTrans2**
         4. 🔍 Find relevant agricultural answers from the knowledge base
 
         **Supported Languages:** English, Hindi, Bengali, Telugu, Tamil, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Odia, Assamese, Urdu, Nepali, Sanskrit, and more!
 
-        **🔧 Translation**: IndicTransToolkit with IndicTrans2 model (official implementation)
         """,
         examples=[],
         theme=gr.themes.Soft(),
@@ -990,23 +1082,30 @@ if __name__ == "__main__":
     print("\n" + "="*60)
     print("🌾 MULTILINGUAL AGRICULTURAL VOICE ASSISTANT")
     print("="*60)
-    print("🔧 Translation: IndicTransToolkit + IndicTrans2 Model")
     print("🎯 Features available:")
     print(" • Multi-format audio processing (15+ formats)")
     print(" • Whisper-based English detection and transcription")
     print(" • MMS-LID for 22+ Indic language detection")
     print(" • IndicConformer for Indic language ASR")
     print(" • Bidirectional Gemma3 punctuation (31 punctuation types)")
-    print(" • IndicTransToolkit for professional-grade translation")
     print(" • Semantic Q&A search")
     print("="*60)
 
-    # Load models on startup
     print("🚀 Loading models...")
     models = load_models()
     qa_system = load_qa_system()
 
-    # Create and launch the Gradio app
     print("🎪 Launching Gradio interface...")
     app = create_gradio_app()
     app.launch()
 
3
 
4
  """
5
  Multilingual Voice-Based Agricultural Recommendation System
6
+ Updated for TorchAudio 2.8+ deprecations and TorchCodec migration
7
  Optimized for Hugging Face Spaces deployment with Whisper-first pipeline
 
8
  """
9
 
10
  from __future__ import annotations
11
  import torch
12
+ import warnings
13
  import json
14
  import os
15
  import re
 
21
  from transformers import AutoModel, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
22
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
23
  from transformers import AutoTokenizer, PretrainedConfig, PreTrainedModel
24
+ from transformers import AutoModelForSeq2SeqLM
 
25
  from pathlib import Path
26
  import torch.nn as nn
27
  from transformers import Gemma3ForCausalLM, Gemma3TextConfig
 
33
  from transformers.modeling_outputs import TokenClassifierOutput
34
  from transformers.utils import logging
35
  from sentence_transformers import SentenceTransformer, util
36
+ import librosa # Alternative to torchaudio
37
+ import soundfile as sf # Alternative audio loading
38
+
39
+ # Try to import TorchCodec and TorchAudio with fallbacks
40
+ try:
41
+ import torchcodec
42
+ from torchcodec import AudioDecoder
43
+ TORCHCODEC_AVAILABLE = True
44
+ print("βœ… TorchCodec available - using new audio loading")
45
+ except ImportError:
46
+ TORCHCODEC_AVAILABLE = False
47
+ print("⚠️ TorchCodec not available - using fallback methods")
48
+
49
+ try:
50
+ import torchaudio
51
+ # Suppress TorchAudio deprecation warnings for backends
52
+ warnings.filterwarnings("ignore", category=UserWarning, module="torchaudio")
53
+ TORCHAUDIO_AVAILABLE = True
54
+ print("βœ… TorchAudio available - with deprecation handling")
55
+ except ImportError:
56
+ TORCHAUDIO_AVAILABLE = False
57
+ torchaudio = None
58
+ print("⚠️ TorchAudio not available - using librosa fallback")
59
+
60
+ try:
61
+ from IndicTransToolkit.processor import IndicProcessor
62
+ INDICTRANS_TOOLKIT_AVAILABLE = True
63
+ print("βœ… IndicTransToolkit available")
64
+ except ImportError:
65
+ INDICTRANS_TOOLKIT_AVAILABLE = False
66
+ print("⚠️ IndicTransToolkit not available - using basic preprocessing")
67
 
68
  logger = logging.get_logger(__name__)
69
  device = "cuda" if torch.cuda.is_available() else "cpu"
70
 
71
  # --- CONFIGURATION ---
72
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
73
+
74
+ if HF_TOKEN:
75
+ from huggingface_hub import login
76
+ try:
77
+ login(HF_TOKEN)
78
+ print("βœ… Successfully logged in to Hugging Face!")
79
+ except Exception as e:
80
+ print(f"⚠️ HF login failed: {e}")
81
+
82
+ # --- FALLBACK INDIC PROCESSOR FOR WHEN TOOLKIT IS NOT AVAILABLE ---
83
+ class BasicIndicProcessor:
84
+ """Basic fallback processor when IndicTransToolkit is not available"""
85
+ def __init__(self, inference=True):
86
+ self.inference = inference
87
+
88
+ def preprocess_batch(self, sentences, src_lang, tgt_lang):
89
+ """Basic preprocessing - add language tokens"""
90
+ processed_sentences = []
91
+ for sentence in sentences:
92
+ processed_sentence = f"<2{tgt_lang}> {sentence.strip()}"
93
+ processed_sentences.append(processed_sentence)
94
+ return processed_sentences
95
+
96
+ def postprocess_batch(self, sentences, lang):
97
+ """Basic postprocessing - remove special tokens"""
98
+ processed_sentences = []
99
+ for sentence in sentences:
100
+ processed_sentence = sentence.strip()
101
+ if processed_sentence.startswith('<2'):
102
+ processed_sentence = processed_sentence.split('>', 1)[-1].strip()
103
+ processed_sentences.append(processed_sentence)
104
+ return processed_sentences
105
 
106
  # --- CUSTOM GEMMA3 BIDIRECTIONAL MODEL FOR PUNCTUATION ---
107
  class Gemma3PunctuationConfig(Gemma3TextConfig):
 
325
  def detect_audio_format(audio_path: str) -> str:
326
  return Path(audio_path).suffix.lower()
327
 
328
+ def load_audio_torchcodec(audio_path: str, target_sr: int = 16000) -> tuple:
329
+ """Load audio using TorchCodec (new recommended method)"""
330
  try:
331
+ print(f"πŸ”§ Loading audio with TorchCodec: {audio_path}")
332
+
333
+ # Use TorchCodec AudioDecoder
334
+ decoder = AudioDecoder(audio_path)
335
+
336
+ # Get audio info
337
+ metadata = decoder.metadata
338
+ original_sr = int(metadata.sample_rate)
339
+
340
+ # Decode audio
341
+ audio_data = decoder.decode() # Returns tensor
342
+ waveform = audio_data.audio # Get audio tensor
343
+
344
+ print(f"🎡 TorchCodec loaded audio: {waveform.shape} at {original_sr} Hz")
345
+
346
+ # Convert to mono if stereo
347
+ if waveform.shape[0] > 1:
348
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
349
+ print(f"πŸ”„ Converted from stereo to mono")
350
+
351
+ # Resample if needed
352
+ if original_sr != target_sr:
353
+ print(f"πŸ”„ Resampling from {original_sr} Hz to {target_sr} Hz...")
354
+ # Use torchaudio functional for resampling (still available)
355
+ if TORCHAUDIO_AVAILABLE:
356
+ waveform = torchaudio.functional.resample(
357
+ waveform,
358
+ orig_freq=original_sr,
359
+ new_freq=target_sr
360
+ )
361
+ else:
362
+ # Fallback to librosa
363
+ waveform_np = waveform.numpy()
364
+ waveform_resampled = librosa.resample(
365
+ waveform_np[0],
366
+ orig_sr=original_sr,
367
+ target_sr=target_sr
368
+ )
369
+ waveform = torch.tensor(waveform_resampled).unsqueeze(0)
370
+ print(f"βœ… Resampled to {target_sr} Hz")
371
+
372
+ print(f"βœ… TorchCodec final audio: {waveform.shape} at {target_sr} Hz")
373
+ return waveform, target_sr
374
+
375
+ except Exception as e:
376
+ print(f"❌ TorchCodec loading failed: {e}")
377
+ raise e
378
 
379
+ def load_audio_librosa(audio_path: str, target_sr: int = 16000) -> tuple:
380
+ """Load audio using librosa (fallback method)"""
381
  try:
382
+ print(f"πŸ”§ Loading audio with librosa: {audio_path}")
383
+
384
+ # Load with librosa
385
+ waveform_np, sr = librosa.load(audio_path, sr=target_sr, mono=True)
386
+
387
+ # Convert to torch tensor and add channel dimension
388
+ waveform = torch.tensor(waveform_np).unsqueeze(0)
389
+
390
+ print(f"βœ… Librosa loaded audio: {waveform.shape} at {target_sr} Hz")
391
+ return waveform, target_sr
392
+
393
+ except Exception as e:
394
+ print(f"❌ Librosa loading failed: {e}")
395
+ raise e
396
 
397
+ def load_audio_torchaudio_legacy(audio_path: str, target_sr: int = 16000) -> tuple:
398
+ """Load audio using legacy TorchAudio (with backend handling)"""
399
+ try:
400
+ print(f"πŸ”§ Loading audio with TorchAudio (legacy): {audio_path}")
401
+
402
+ # Try different backends
403
+ backends_to_try = []
404
+
405
+ if TORCHAUDIO_AVAILABLE:
406
+ try:
407
+ # Suppress the deprecation warning temporarily
408
+ with warnings.catch_warnings():
409
+ warnings.simplefilter("ignore")
410
+ available_backends = torchaudio.list_audio_backends()
411
+ backends_to_try = available_backends
412
+ except Exception:
413
+ backends_to_try = ['soundfile', 'sox_io']
414
+
415
+ audio_format = detect_audio_format(audio_path)
416
+ print(f"🎡 Audio format: {audio_format}")
417
+ print(f"πŸ”§ Available backends: {backends_to_try}")
418
+
419
+ waveform = None
420
+ orig_sr = None
421
+
422
+ # Try to load with different backends
423
+ for backend in backends_to_try + [None]: # None for default
424
  try:
 
425
  if backend:
426
+ print(f"πŸ”„ Trying {backend} backend...")
427
+ if hasattr(torchaudio, 'set_audio_backend'):
428
+ torchaudio.set_audio_backend(backend)
429
  waveform, orig_sr = torchaudio.load(audio_path, backend=backend)
430
  else:
431
+ print(f"πŸ”„ Trying default backend...")
432
  waveform, orig_sr = torchaudio.load(audio_path)
433
+
434
+ print(f"βœ… Successfully loaded with {backend or 'default'} backend")
435
+ break
436
+
437
+ except Exception as e:
438
+ print(f"❌ {backend or 'default'} backend failed: {e}")
439
+ continue
440
+
441
+ if waveform is None:
442
+ raise Exception("All TorchAudio backends failed")
443
+
 
 
 
 
 
 
 
 
444
  print(f"🎡 Loaded audio: {waveform.shape} at {orig_sr} Hz")
445
+
446
+ # Convert to mono
447
+ if waveform.shape[0] > 1:
448
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
449
+ print(f"πŸ”„ Converted from stereo to mono")
450
+
451
+ # Resample if needed
452
  if orig_sr != target_sr:
453
  print(f"πŸ”„ Resampling from {orig_sr} Hz to {target_sr} Hz...")
454
  waveform = torchaudio.functional.resample(
 
457
  new_freq=target_sr
458
  )
459
  print(f"βœ… Resampled to {target_sr} Hz")
460
+
 
 
 
461
  return waveform, target_sr
462
+
463
+ except Exception as e:
464
+ print(f"❌ TorchAudio legacy loading failed: {e}")
465
+ raise e
466
 
467
+ def preprocess_audio(audio_path: str, target_sr: int = 16000) -> tuple:
468
+ """
469
+ Preprocess audio with multiple fallback methods for TorchAudio 2.8+ compatibility
470
+ """
471
+ try:
472
+ original_audio_format = detect_audio_format(audio_path)
473
+ print(f"🎡 Detected original format: {original_audio_format}")
474
+
475
+ # Method 1: Try TorchCodec (recommended for future)
476
+ if TORCHCODEC_AVAILABLE:
477
+ try:
478
+ return load_audio_torchcodec(audio_path, target_sr)
479
+ except Exception as e:
480
+ print(f"⚠️ TorchCodec failed: {e}")
481
+
482
+ # Method 2: Try TorchAudio legacy (with deprecation handling)
483
+ if TORCHAUDIO_AVAILABLE:
484
+ try:
485
+ return load_audio_torchaudio_legacy(audio_path, target_sr)
486
+ except Exception as e:
487
+ print(f"⚠️ TorchAudio legacy failed: {e}")
488
+
489
+ # Method 3: Fallback to librosa
490
+ try:
491
+ return load_audio_librosa(audio_path, target_sr)
492
+ except Exception as e:
493
+ print(f"⚠️ Librosa fallback failed: {e}")
494
+
495
+ raise Exception("All audio loading methods failed")
496
+
497
  except Exception as e:
498
  error_msg = f"❌ Error in audio preprocessing: {str(e)}"
499
  print(error_msg)
500
  raise Exception(error_msg)
501
 
 
 
 
 
 
502
  # --- GLOBAL MODEL STORAGE ---
 
503
  models = {}
504
  qa_system = {}
505
 
 
507
  """Load all models with caching using global variables."""
508
  global models
509
 
510
+ if models:
511
  print("βœ… Models already loaded from cache")
512
  return models
513
 
 
568
  models['punctuation_model'] = None
569
  models['punctuation_id2label'] = None
570
 
571
+ # Load IndicTrans2 model
572
  try:
573
  print("πŸ”„ Loading IndicTrans2 for translation...")
574
  model_name = "ai4bharat/indictrans2-indic-en-1B"
575
 
 
576
  models['indictrans_tokenizer'] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
577
  models['indictrans_model'] = AutoModelForSeq2SeqLM.from_pretrained(
578
  model_name,
 
580
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
581
  ).to(device)
582
 
583
+ # Use IndicTransToolkit if available, otherwise use basic processor
584
+ if INDICTRANS_TOOLKIT_AVAILABLE:
585
+ models['indic_processor'] = IndicProcessor(inference=True)
586
+ print("βœ… IndicTrans2 loaded with IndicTransToolkit")
587
+ else:
588
+ models['indic_processor'] = BasicIndicProcessor(inference=True)
589
+ print("βœ… IndicTrans2 loaded with basic processor")
590
+
591
  except Exception as e:
592
  print(f"❌ Error loading IndicTrans2 model: {e}")
 
593
  models['indictrans_tokenizer'] = None
594
  models['indictrans_model'] = None
595
  models['indic_processor'] = None
 
600
  """Load Q&A system with caching using global variables."""
601
  global qa_system
602
 
603
+ if qa_system:
604
  print("βœ… Q&A system already loaded from cache")
605
  return qa_system
606
 
607
  print("πŸš€ Loading Q&A system for the first time...")
608
 
609
  try:
 
610
  if os.path.exists("cleaned_qa_dataset.xlsx"):
611
  df = pd.read_excel("cleaned_qa_dataset.xlsx")
612
  qa_pairs = df[['Question', 'Answer']].dropna().drop_duplicates().reset_index(drop=True)
 
729
 
730
  def translate_with_indictrans2(text: str, source_lang: str = "hin_Deva") -> Dict:
731
  """
732
+ Translate Indic language text to English using IndicTrans2 model.
733
  """
734
  try:
735
  if not models.get('indictrans_model') or not models.get('indictrans_tokenizer') or not models.get('indic_processor'):
 
739
  "translated_text": ""
740
  }
741
 
742
+ print(f"πŸ”„ Translating with IndicTrans2: {source_lang} -> eng_Latn")
743
 
 
744
  input_sentences = [text.strip()]
745
 
746
  # Preprocess with IndicProcessor
 
777
  clean_up_tokenization_spaces=True,
778
  )
779
 
780
+ # Postprocess the translations
781
  translations = models['indic_processor'].postprocess_batch(generated_tokens, lang="eng_Latn")
782
 
783
  translated_text = translations[0] if translations else ""
 
806
  }
807
 
808
  try:
 
809
  user_question_embedding = qa_system['model'].encode(user_question, convert_to_tensor=True)
 
 
810
  similarities = util.cos_sim(user_question_embedding, qa_system['question_embeddings'])
 
 
811
  top_results = torch.topk(similarities, k=top_k)
812
 
813
  results = []
 
872
  print("πŸ‡ΊπŸ‡Έ Processing as English audio...")
873
  detected_lang_str = "Detected Language: English (Whisper Detection)"
874
 
 
875
  punctuated_transcription = add_punctuation(whisper_transcription)
876
  print(f"Original Whisper: {whisper_transcription}")
877
  print(f"With punctuation: {punctuated_transcription}")
878
 
 
879
  translation_result = punctuated_transcription
880
 
881
  return (
 
893
 
894
  print("πŸ” Using MMS-LID for detailed language identification...")
895
 
 
896
  inputs = models['lid_processor'](waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
897
  with torch.no_grad():
898
  outputs = models['lid_model'](**inputs)
 
913
  if not models.get('asr_model'):
914
  return "ASR model not available.", "", ""
915
 
 
916
  print(f"πŸ”€ Transcribing with IndicConformer ({detected_lang_name})...")
917
  with torch.no_grad():
918
  transcription = models['asr_model'](waveform_16k.to(device), asr_lang_code, "rnnt")
919
  print("βœ… IndicConformer transcription complete.")
920
 
 
921
  punctuated_transcription = add_punctuation(transcription.strip()) if transcription else ""
922
  print(f"Original: {transcription}")
923
  print(f"With punctuation: {punctuated_transcription}")
924
 
925
+ # Translation to English using IndicTrans2
926
  translation_result = ""
927
  translation_error = ""
928
 
929
  if punctuated_transcription:
930
  indictrans_lang_code = ASR_TO_INDICTRANS_MAP.get(asr_lang_code)
931
  if indictrans_lang_code:
932
+ print(f"πŸ”„ Translating {detected_lang_name} to English with IndicTrans2...")
933
  translation_response = translate_with_indictrans2(
934
  punctuated_transcription,
935
  indictrans_lang_code
 
937
 
938
  if translation_response["success"]:
939
  translation_result = translation_response["translated_text"]
940
+ print("βœ… IndicTrans2 translation complete.")
941
  else:
942
  translation_error = translation_response["error"]
943
  translation_result = "Translation failed"
 
948
  else:
949
  translation_result = "No text to translate"
950
 
 
951
  if translation_error:
952
  translation_display = f"❌ {translation_result}\nError: {translation_error}"
953
  else:
 
966
  """Process audio and perform semantic search."""
967
  print(f"--- Processing audio file with Whisper-first pipeline: {audio_path} ---")
968
 
 
969
  detected_language, transcription, translated_text = transcribe_audio_with_lid(audio_path)
970
 
 
971
  if "Error" in detected_language:
972
  return {
973
  "status": "audio_processing_failed",
 
993
  if not audio_path:
994
  return "No audio file provided", "", "", "Please upload an audio file."
995
 
 
996
  integrated_result = process_audio_and_search(audio_path)
997
 
 
998
  detected_language_output = ""
999
  transcription_output = ""
1000
  translated_text_output = ""
1001
  semantic_search_output_string = ""
1002
 
 
1003
  if integrated_result["status"] == "success":
 
1004
  audio_processing = integrated_result["audio_processing"]
1005
  detected_language_output = audio_processing["detected_language"]
1006
  transcription_output = audio_processing["transcription"]
1007
  translated_text_output = audio_processing["translated_text"]
1008
 
 
1009
  semantic_search = integrated_result["semantic_search"]
1010
 
 
1011
  if semantic_search["status"] == "success":
1012
  semantic_search_output_string = "--- Top 3 Semantic Search Results ---\n\n"
1013
  for result in semantic_search["results"]:
 
1026
  )
1027
 
1028
  else:
 
1029
  error_message = integrated_result.get("error", "An unknown error occurred during audio processing.")
1030
  detected_language_output = f"Error: {error_message}"
1031
  transcription_output = "N/A"
1032
  translated_text_output = "N/A"
1033
  semantic_search_output_string = "Semantic search could not be performed due to audio processing error."
1034
 
 
1035
  return (detected_language_output, transcription_output, translated_text_output, semantic_search_output_string)
1036
 
1037
  def create_gradio_app():
1038
  """Create the Gradio interface."""
1039
 
 
1040
  audio_input = gr.Audio(type="filepath", label="Upload Audio File")
 
 
1041
  detected_language_output = gr.Textbox(label="Detected Language")
1042
  transcription_output = gr.Textbox(label="Transcription")
1043
  translated_text_output = gr.Textbox(label="Translated Text")
1044
  semantic_search_output = gr.Textbox(label="Semantic Search Results")
1045
 
1046
+ audio_backend_info = ""
1047
+ if TORCHCODEC_AVAILABLE:
1048
+ audio_backend_info = "🎡 **Audio Backend**: TorchCodec (recommended)"
1049
+ elif TORCHAUDIO_AVAILABLE:
1050
+ audio_backend_info = "🎡 **Audio Backend**: TorchAudio (legacy with deprecation handling)"
1051
+ else:
1052
+ audio_backend_info = "🎡 **Audio Backend**: Librosa (fallback)"
1053
+
1054
  iface = gr.Interface(
1055
  fn=gradio_interface_fn,
1056
  inputs=audio_input,
1057
  outputs=[detected_language_output, transcription_output, translated_text_output, semantic_search_output],
1058
  title="🌾 Multilingual Agricultural Voice Assistant",
1059
+ description=f"""
1060
  Upload an audio file in English or any of the 22+ supported Indic languages.
1061
  The system will:
1062
  1. 🎧 Detect the language automatically
1063
  2. πŸ“ Transcribe the speech with punctuation
1064
+ 3. 🌍 Translate to English using **IndicTrans2**
1065
  4. πŸ” Find relevant agricultural answers from the knowledge base
1066
 
1067
  **Supported Languages:** English, Hindi, Bengali, Telugu, Tamil, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Odia, Assamese, Urdu, Nepali, Sanskrit, and more!
1068
 
1069
+ {audio_backend_info}
1070
+ **πŸ”§ Translation**: IndicTrans2 with robust preprocessing
1071
+ **⚠️ Note**: Updated for TorchAudio 2.8+ deprecations
1072
  """,
1073
  examples=[],
1074
  theme=gr.themes.Soft(),
 
1082
  print("\n" + "="*60)
1083
  print("🌾 MULTILINGUAL AGRICULTURAL VOICE ASSISTANT")
1084
  print("="*60)
1085
+
1086
+ if TORCHCODEC_AVAILABLE:
1087
+ print("🎡 Audio Backend: TorchCodec (recommended)")
1088
+ elif TORCHAUDIO_AVAILABLE:
1089
+ print("🎡 Audio Backend: TorchAudio (legacy with deprecation handling)")
1090
+ else:
1091
+ print("🎡 Audio Backend: Librosa (fallback)")
1092
+
1093
+ print("πŸ”§ Translation: IndicTrans2 Model")
1094
+ print("⚠️ Updated for TorchAudio 2.8+ deprecations")
1095
  print("🎯 Features available:")
1096
  print(" β€’ Multi-format audio processing (15+ formats)")
1097
  print(" β€’ Whisper-based English detection and transcription")
1098
  print(" β€’ MMS-LID for 22+ Indic language detection")
1099
  print(" β€’ IndicConformer for Indic language ASR")
1100
  print(" β€’ Bidirectional Gemma3 punctuation (31 punctuation types)")
1101
+ print(" β€’ IndicTrans2 for professional translation")
1102
  print(" β€’ Semantic Q&A search")
1103
  print("="*60)
1104
 
 
1105
  print("πŸš€ Loading models...")
1106
  models = load_models()
1107
  qa_system = load_qa_system()
1108
 
 
1109
  print("πŸŽͺ Launching Gradio interface...")
1110
  app = create_gradio_app()
1111
  app.launch()