Noumida committed on
Commit 418029a · verified · 1 Parent(s): 6d9d341

Update app.py

Files changed (1)
  1. app.py +257 -158
app.py CHANGED
@@ -3,13 +3,13 @@
 
 """
 Multilingual Voice-Based Agricultural Recommendation System
 Optimized for Hugging Face Spaces deployment with Whisper-first pipeline
-Now using IndicTransToolkit for proper IndicTrans2 implementation
 """
 
 from __future__ import annotations
 import torch
-import torchaudio
 import json
 import os
 import re
@@ -21,8 +21,7 @@ from typing import List, Dict, Optional, Union
 from transformers import AutoModel, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import AutoTokenizer, PretrainedConfig, PreTrainedModel
-from transformers import AutoModelForSeq2SeqLM  # Added for IndicTrans2
-from IndicTransToolkit.processor import IndicProcessor  # Official IndicTrans2 processor
 from pathlib import Path
 import torch.nn as nn
 from transformers import Gemma3ForCausalLM, Gemma3TextConfig
@@ -34,22 +33,75 @@ from transformers.models.gemma3.modeling_gemma3 import (
 from transformers.modeling_outputs import TokenClassifierOutput
 from transformers.utils import logging
 from sentence_transformers import SentenceTransformer, util
 
 logger = logging.get_logger(__name__)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # --- CONFIGURATION ---
-# Hugging Face Token (optional, can be set as environment variable)
-# HF_TOKEN = os.getenv("HF_TOKEN", "")
-
-# # Login to Hugging Face if token is provided
-# if HF_TOKEN:
-#     from huggingface_hub import login
-#     try:
-#         login(HF_TOKEN)
-#         print("✅ Successfully logged in to Hugging Face!")
-#     except Exception as e:
-#         print(f"⚠️ HF login failed: {e}")
 
 # --- CUSTOM GEMMA3 BIDIRECTIONAL MODEL FOR PUNCTUATION ---
 class Gemma3PunctuationConfig(Gemma3TextConfig):
@@ -273,100 +325,130 @@ SUPPORTED_AUDIO_FORMATS = {
 def detect_audio_format(audio_path: str) -> str:
     return Path(audio_path).suffix.lower()
 
-def get_optimal_backend(audio_format: str) -> str:
-    ffmpeg_formats = {'.mp3', '.opus', '.m4a', '.aac', '.mp4', '.webm', '.3gp', '.mpeg'}
     try:
-        backends = torchaudio.list_audio_backends()
-        if 'ffmpeg' in backends and audio_format in ffmpeg_formats:
-            return 'ffmpeg'
-        elif 'sox_io' in backends:
-            return 'sox_io'
-        elif 'soundfile' in backends:
-            return 'soundfile'
-        else:
-            return None
-    except:
-        return None
-
-def convert_to_mono(waveform: torch.Tensor) -> torch.Tensor:
-    if waveform.shape[0] > 1:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
-        print(f"🔄 Converted from stereo to mono")
-    else:
-        print("📻 Audio is already mono")
-    return waveform
 
-def preprocess_audio(audio_path: str, target_sr: int = 16000) -> tuple:
     try:
-        original_audio_format = detect_audio_format(audio_path)
-        print(f"🎵 Detected original format: {original_audio_format}")
-
-        current_audio_path = audio_path
-        temp_mp3_path = None
-
-        if original_audio_format in {'.mpeg', '.opus'}:
-            print(f"🔄 Converting {original_audio_format} to .mp3...")
-            temp_mp3_fd, temp_mp3_path = tempfile.mkstemp(suffix=".mp3", prefix="audio_converted_")
-            os.close(temp_mp3_fd)
 
             try:
-                backend = get_optimal_backend(original_audio_format)
                 if backend:
                     waveform, orig_sr = torchaudio.load(audio_path, backend=backend)
                 else:
                     waveform, orig_sr = torchaudio.load(audio_path)
-
-                torchaudio.save(temp_mp3_path, waveform, orig_sr, format="mp3")
-                current_audio_path = temp_mp3_path
-                print(f"✅ Converted to .mp3: {current_audio_path}")
-
-            except Exception as conversion_error:
-                print(f"❌ Error during conversion to .mp3: {conversion_error}")
-                print("⚠️ Could not convert to .mp3, attempting to process original file.")
-                current_audio_path = audio_path
-                if temp_mp3_path and os.path.exists(temp_mp3_path):
-                    os.unlink(temp_mp3_path)
-                    temp_mp3_path = None
-
-        audio_format_to_process = detect_audio_format(current_audio_path)
-        print(f"🎵 Processing format: {audio_format_to_process}")
-
-        if audio_format_to_process not in SUPPORTED_AUDIO_FORMATS:
-            print(f"⚠️ Warning: {audio_format_to_process} may not be fully supported")
-
-        backend = get_optimal_backend(audio_format_to_process)
-        if backend:
-            print(f"🔧 Using {backend} backend for {audio_format_to_process}")
-
-        try:
-            if backend:
-                waveform, orig_sr = torchaudio.load(current_audio_path, backend=backend)
-            else:
-                waveform, orig_sr = torchaudio.load(current_audio_path)
-        except Exception as load_error:
-            print(f"⚠️ Primary load method failed: {str(load_error)}")
-            print("🔄 Trying alternative loading method...")
-
-            for fallback_backend in ['ffmpeg', 'sox_io', 'soundfile']:
-                try:
-                    backends = torchaudio.list_audio_backends()
-                    if fallback_backend in backends:
-                        print(f"🔄 Trying {fallback_backend} backend...")
-                        waveform, orig_sr = torchaudio.load(current_audio_path, backend=fallback_backend)
-                        print(f"✅ Successfully loaded with {fallback_backend} backend")
-                        break
-                except Exception as e:
-                    continue
-            else:
-                try:
-                    waveform, orig_sr = torchaudio.load(current_audio_path)
-                    print("✅ Loaded with default backend")
-                except Exception as final_error:
-                    raise Exception(f"Failed to load audio file with any backend: {final_error}")
-
         print(f"🎵 Loaded audio: {waveform.shape} at {orig_sr} Hz")
-        waveform = convert_to_mono(waveform)
-
         if orig_sr != target_sr:
             print(f"🔄 Resampling from {orig_sr} Hz to {target_sr} Hz...")
             waveform = torchaudio.functional.resample(
@@ -375,24 +457,49 @@ def preprocess_audio(audio_path: str, target_sr: int = 16000) -> tuple:
                 new_freq=target_sr
             )
             print(f"✅ Resampled to {target_sr} Hz")
-        else:
-            print(f"✅ Audio already at target {target_sr} Hz")
-
-        print(f"✅ Final preprocessed audio: {waveform.shape} at {target_sr} Hz")
         return waveform, target_sr
 
     except Exception as e:
         error_msg = f"❌ Error in audio preprocessing: {str(e)}"
         print(error_msg)
         raise Exception(error_msg)
 
-    finally:
-        if temp_mp3_path and os.path.exists(temp_mp3_path):
-            if temp_mp3_path and os.path.exists(temp_mp3_path) and "temp" in temp_mp3_path:
-                os.unlink(temp_mp3_path)
-
 # --- GLOBAL MODEL STORAGE ---
-# Global variables to store loaded models (acts as cache)
 models = {}
 qa_system = {}
 
@@ -400,7 +507,7 @@ def load_models():
     """Load all models with caching using global variables."""
     global models
 
-    if models: # Already loaded
         print("✅ Models already loaded from cache")
         return models
 
@@ -461,12 +568,11 @@ def load_models():
         models['punctuation_model'] = None
         models['punctuation_id2label'] = None
 
-    # Load IndicTrans2 model using the proper IndicTransToolkit
     try:
         print("🔄 Loading IndicTrans2 for translation...")
         model_name = "ai4bharat/indictrans2-indic-en-1B"
 
-        # Load tokenizer and model
         models['indictrans_tokenizer'] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         models['indictrans_model'] = AutoModelForSeq2SeqLM.from_pretrained(
             model_name,
@@ -474,13 +580,16 @@ def load_models():
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
         ).to(device)
 
-        # Initialize IndicProcessor
-        models['indic_processor'] = IndicProcessor(inference=True)
-
-        print("✅ IndicTrans2 loaded successfully - Ready for translation!")
     except Exception as e:
         print(f"❌ Error loading IndicTrans2 model: {e}")
-        print("⚠️ Translation functionality will be limited")
         models['indictrans_tokenizer'] = None
         models['indictrans_model'] = None
         models['indic_processor'] = None
@@ -491,14 +600,13 @@ def load_qa_system():
     """Load Q&A system with caching using global variables."""
     global qa_system
 
-    if qa_system: # Already loaded
         print("✅ Q&A system already loaded from cache")
         return qa_system
 
     print("🚀 Loading Q&A system for the first time...")
 
     try:
-        # Load your Q&A dataset
         if os.path.exists("cleaned_qa_dataset.xlsx"):
             df = pd.read_excel("cleaned_qa_dataset.xlsx")
             qa_pairs = df[['Question', 'Answer']].dropna().drop_duplicates().reset_index(drop=True)
@@ -621,7 +729,7 @@ def detect_language_with_whisper(audio_path):
 
 def translate_with_indictrans2(text: str, source_lang: str = "hin_Deva") -> Dict:
     """
-    Translate Indic language text to English using IndicTransToolkit and IndicTrans2 model.
     """
     try:
         if not models.get('indictrans_model') or not models.get('indictrans_tokenizer') or not models.get('indic_processor'):
@@ -631,9 +739,8 @@ def translate_with_indictrans2(text: str, source_lang: str = "hin_Deva") -> Dict
                 "translated_text": ""
             }
 
-        print(f"🔄 Translating with IndicTransToolkit: {source_lang} -> eng_Latn")
 
-        # Prepare input sentences
         input_sentences = [text.strip()]
 
         # Preprocess with IndicProcessor
@@ -670,7 +777,7 @@ def translate_with_indictrans2(text: str, source_lang: str = "hin_Deva") -> Dict
             clean_up_tokenization_spaces=True,
         )
 
-        # Postprocess the translations using IndicProcessor
         translations = models['indic_processor'].postprocess_batch(generated_tokens, lang="eng_Latn")
 
         translated_text = translations[0] if translations else ""
@@ -699,13 +806,8 @@ def semantic_qa_search(user_question, similarity_threshold=0.3, top_k=3):
         }
 
     try:
-        # Encode the user question
         user_question_embedding = qa_system['model'].encode(user_question, convert_to_tensor=True)
-
-        # Compute cosine similarity
         similarities = util.cos_sim(user_question_embedding, qa_system['question_embeddings'])
-
-        # Get top k most similar questions
         top_results = torch.topk(similarities, k=top_k)
 
         results = []
@@ -770,12 +872,10 @@ def transcribe_audio_with_lid(audio_path):
         print("🇺🇸 Processing as English audio...")
         detected_lang_str = "Detected Language: English (Whisper Detection)"
 
-        # Add punctuation to Whisper transcription
         punctuated_transcription = add_punctuation(whisper_transcription)
         print(f"Original Whisper: {whisper_transcription}")
         print(f"With punctuation: {punctuated_transcription}")
 
-        # For English, translation is the same as transcription
         translation_result = punctuated_transcription
 
         return (
@@ -793,7 +893,6 @@ def transcribe_audio_with_lid(audio_path):
 
     print("🔍 Using MMS-LID for detailed language identification...")
 
-    # Language detection using MMS-LID for non-English
     inputs = models['lid_processor'](waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = models['lid_model'](**inputs)
@@ -814,25 +913,23 @@ def transcribe_audio_with_lid(audio_path):
     if not models.get('asr_model'):
         return "ASR model not available.", "", ""
 
-    # Use IndicConformer for non-English transcription
     print(f"🔤 Transcribing with IndicConformer ({detected_lang_name})...")
     with torch.no_grad():
         transcription = models['asr_model'](waveform_16k.to(device), asr_lang_code, "rnnt")
     print("✅ IndicConformer transcription complete.")
 
-    # Add punctuation to transcription
     punctuated_transcription = add_punctuation(transcription.strip()) if transcription else ""
     print(f"Original: {transcription}")
     print(f"With punctuation: {punctuated_transcription}")
 
-    # Translation to English using IndicTrans2 with IndicTransToolkit
     translation_result = ""
     translation_error = ""
 
     if punctuated_transcription:
         indictrans_lang_code = ASR_TO_INDICTRANS_MAP.get(asr_lang_code)
         if indictrans_lang_code:
-            print(f"🔄 Translating {detected_lang_name} to English with IndicTransToolkit...")
             translation_response = translate_with_indictrans2(
                 punctuated_transcription,
                 indictrans_lang_code
@@ -840,7 +937,7 @@ def transcribe_audio_with_lid(audio_path):
 
             if translation_response["success"]:
                 translation_result = translation_response["translated_text"]
-                print("✅ IndicTransToolkit translation complete.")
             else:
                 translation_error = translation_response["error"]
                 translation_result = "Translation failed"
@@ -851,7 +948,6 @@ def transcribe_audio_with_lid(audio_path):
 
     else:
         translation_result = "No text to translate"
 
-    # Combine results
     if translation_error:
         translation_display = f"❌ {translation_result}\nError: {translation_error}"
@@ -870,10 +966,8 @@ def process_audio_and_search(audio_path):
     """Process audio and perform semantic search."""
     print(f"--- Processing audio file with Whisper-first pipeline: {audio_path} ---")
 
-    # Process audio
     detected_language, transcription, translated_text = transcribe_audio_with_lid(audio_path)
 
-    # Check for errors
     if "Error" in detected_language:
         return {
             "status": "audio_processing_failed",
@@ -899,27 +993,21 @@ def gradio_interface_fn(audio_path):
     if not audio_path:
         return "No audio file provided", "", "", "Please upload an audio file."
 
-    # Call the integrated workflow function
     integrated_result = process_audio_and_search(audio_path)
 
-    # Initialize output variables
     detected_language_output = ""
     transcription_output = ""
     translated_text_output = ""
     semantic_search_output_string = ""
 
-    # Check the status of the result
     if integrated_result["status"] == "success":
-        # Extract audio processing results
         audio_processing = integrated_result["audio_processing"]
         detected_language_output = audio_processing["detected_language"]
         transcription_output = audio_processing["transcription"]
         translated_text_output = audio_processing["translated_text"]
 
-        # Extract semantic search results
         semantic_search = integrated_result["semantic_search"]
 
-        # Format semantic search output to show top 3 results
         if semantic_search["status"] == "success":
             semantic_search_output_string = "--- Top 3 Semantic Search Results ---\n\n"
             for result in semantic_search["results"]:
@@ -938,45 +1026,49 @@ def gradio_interface_fn(audio_path):
             )
 
     else:
-        # Handle audio processing failure
         error_message = integrated_result.get("error", "An unknown error occurred during audio processing.")
         detected_language_output = f"Error: {error_message}"
         transcription_output = "N/A"
         translated_text_output = "N/A"
         semantic_search_output_string = "Semantic search could not be performed due to audio processing error."
 
-    # Return the formatted outputs as a tuple
     return (detected_language_output, transcription_output, translated_text_output, semantic_search_output_string)
 
 def create_gradio_app():
     """Create the Gradio interface."""
 
-    # Define input component for audio file
     audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-
-    # Define output components for audio processing results
     detected_language_output = gr.Textbox(label="Detected Language")
     transcription_output = gr.Textbox(label="Transcription")
     translated_text_output = gr.Textbox(label="Translated Text")
     semantic_search_output = gr.Textbox(label="Semantic Search Results")
 
-    # Create the interface
     iface = gr.Interface(
         fn=gradio_interface_fn,
         inputs=audio_input,
         outputs=[detected_language_output, transcription_output, translated_text_output, semantic_search_output],
         title="🌾 Multilingual Agricultural Voice Assistant",
-        description="""
         Upload an audio file in English or any of the 22+ supported Indic languages.
         The system will:
         1. 🎧 Detect the language automatically
         2. 📝 Transcribe the speech with punctuation
-        3. 🌍 Translate to English using **IndicTransToolkit + IndicTrans2**
         4. 🔍 Find relevant agricultural answers from the knowledge base
 
         **Supported Languages:** English, Hindi, Bengali, Telugu, Tamil, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Odia, Assamese, Urdu, Nepali, Sanskrit, and more!
 
-        **🔧 Translation**: IndicTransToolkit with IndicTrans2 model (official implementation)
         """,
         examples=[],
         theme=gr.themes.Soft(),
@@ -990,23 +1082,30 @@ if __name__ == "__main__":
     print("\n" + "="*60)
     print("🌾 MULTILINGUAL AGRICULTURAL VOICE ASSISTANT")
     print("="*60)
-    print("🔧 Translation: IndicTransToolkit + IndicTrans2 Model")
     print("🎯 Features available:")
     print(" • Multi-format audio processing (15+ formats)")
     print(" • Whisper-based English detection and transcription")
     print(" • MMS-LID for 22+ Indic language detection")
     print(" • IndicConformer for Indic language ASR")
     print(" • Bidirectional Gemma3 punctuation (31 punctuation types)")
-    print(" • IndicTransToolkit for professional-grade translation")
     print(" • Semantic Q&A search")
     print("="*60)
 
-    # Load models on startup
     print("🚀 Loading models...")
     models = load_models()
     qa_system = load_qa_system()
 
-    # Create and launch the Gradio app
     print("🎪 Launching Gradio interface...")
     app = create_gradio_app()
     app.launch()
 
3
 
4
  """
5
  Multilingual Voice-Based Agricultural Recommendation System
6
+ Updated for TorchAudio 2.8+ deprecations and TorchCodec migration
7
  Optimized for Hugging Face Spaces deployment with Whisper-first pipeline
 
8
  """
9
 
10
  from __future__ import annotations
11
  import torch
12
+ import warnings
13
  import json
14
  import os
15
  import re
 
21
  from transformers import AutoModel, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
22
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
23
  from transformers import AutoTokenizer, PretrainedConfig, PreTrainedModel
24
+ from transformers import AutoModelForSeq2SeqLM
 
25
  from pathlib import Path
26
  import torch.nn as nn
27
  from transformers import Gemma3ForCausalLM, Gemma3TextConfig
 
33
  from transformers.modeling_outputs import TokenClassifierOutput
34
  from transformers.utils import logging
35
  from sentence_transformers import SentenceTransformer, util
36
+ import librosa # Alternative to torchaudio
37
+ import soundfile as sf # Alternative audio loading
38
+
39
+ # Try to import TorchCodec and TorchAudio with fallbacks
40
+ try:
41
+ import torchcodec
42
+ from torchcodec import AudioDecoder
43
+ TORCHCODEC_AVAILABLE = True
44
+ print("βœ… TorchCodec available - using new audio loading")
45
+ except ImportError:
46
+ TORCHCODEC_AVAILABLE = False
47
+ print("⚠️ TorchCodec not available - using fallback methods")
48
+
49
+ try:
50
+ import torchaudio
51
+ # Suppress TorchAudio deprecation warnings for backends
52
+ warnings.filterwarnings("ignore", category=UserWarning, module="torchaudio")
53
+ TORCHAUDIO_AVAILABLE = True
54
+ print("βœ… TorchAudio available - with deprecation handling")
55
+ except ImportError:
56
+ TORCHAUDIO_AVAILABLE = False
57
+ torchaudio = None
58
+ print("⚠️ TorchAudio not available - using librosa fallback")
59
+
60
+ try:
61
+ from IndicTransToolkit.processor import IndicProcessor
62
+ INDICTRANS_TOOLKIT_AVAILABLE = True
63
+ print("βœ… IndicTransToolkit available")
64
+ except ImportError:
65
+ INDICTRANS_TOOLKIT_AVAILABLE = False
66
+ print("⚠️ IndicTransToolkit not available - using basic preprocessing")
67
 
68
  logger = logging.get_logger(__name__)
69
  device = "cuda" if torch.cuda.is_available() else "cpu"
70
 
71
  # --- CONFIGURATION ---
72
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
73
+
74
+ if HF_TOKEN:
75
+ from huggingface_hub import login
76
+ try:
77
+ login(HF_TOKEN)
78
+ print("βœ… Successfully logged in to Hugging Face!")
79
+ except Exception as e:
80
+ print(f"⚠️ HF login failed: {e}")
81
+
82
+ # --- FALLBACK INDIC PROCESSOR FOR WHEN TOOLKIT IS NOT AVAILABLE ---
83
+ class BasicIndicProcessor:
84
+ """Basic fallback processor when IndicTransToolkit is not available"""
85
+ def __init__(self, inference=True):
86
+ self.inference = inference
87
+
88
+ def preprocess_batch(self, sentences, src_lang, tgt_lang):
89
+ """Basic preprocessing - add language tokens"""
90
+ processed_sentences = []
91
+ for sentence in sentences:
92
+ processed_sentence = f"<2{tgt_lang}> {sentence.strip()}"
93
+ processed_sentences.append(processed_sentence)
94
+ return processed_sentences
95
+
96
+ def postprocess_batch(self, sentences, lang):
97
+ """Basic postprocessing - remove special tokens"""
98
+ processed_sentences = []
99
+ for sentence in sentences:
100
+ processed_sentence = sentence.strip()
101
+ if processed_sentence.startswith('<2'):
102
+ processed_sentence = processed_sentence.split('>', 1)[-1].strip()
103
+ processed_sentences.append(processed_sentence)
104
+ return processed_sentences
105
 
106
  # --- CUSTOM GEMMA3 BIDIRECTIONAL MODEL FOR PUNCTUATION ---
107
  class Gemma3PunctuationConfig(Gemma3TextConfig):
 
325
  def detect_audio_format(audio_path: str) -> str:
326
  return Path(audio_path).suffix.lower()
327
 
328
+ def load_audio_torchcodec(audio_path: str, target_sr: int = 16000) -> tuple:
329
+ """Load audio using TorchCodec (new recommended method)"""
330
  try:
331
+ print(f"πŸ”§ Loading audio with TorchCodec: {audio_path}")
332
+
333
+ # Use TorchCodec AudioDecoder
334
+ decoder = AudioDecoder(audio_path)
335
+
336
+ # Get audio info
337
+ metadata = decoder.metadata
338
+ original_sr = int(metadata.sample_rate)
339
+
340
+ # Decode audio
341
+ audio_data = decoder.decode() # Returns tensor
342
+ waveform = audio_data.audio # Get audio tensor
343
+
344
+ print(f"🎡 TorchCodec loaded audio: {waveform.shape} at {original_sr} Hz")
345
+
346
+ # Convert to mono if stereo
347
+ if waveform.shape[0] > 1:
348
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
349
+ print(f"πŸ”„ Converted from stereo to mono")
350
+
351
+ # Resample if needed
352
+ if original_sr != target_sr:
353
+ print(f"πŸ”„ Resampling from {original_sr} Hz to {target_sr} Hz...")
354
+ # Use torchaudio functional for resampling (still available)
355
+ if TORCHAUDIO_AVAILABLE:
356
+ waveform = torchaudio.functional.resample(
357
+ waveform,
358
+ orig_freq=original_sr,
359
+ new_freq=target_sr
360
+ )
361
+ else:
362
+ # Fallback to librosa
363
+ waveform_np = waveform.numpy()
364
+ waveform_resampled = librosa.resample(
365
+ waveform_np[0],
366
+ orig_sr=original_sr,
367
+ target_sr=target_sr
368
+ )
369
+ waveform = torch.tensor(waveform_resampled).unsqueeze(0)
370
+ print(f"βœ… Resampled to {target_sr} Hz")
371
+
372
+ print(f"βœ… TorchCodec final audio: {waveform.shape} at {target_sr} Hz")
373
+ return waveform, target_sr
374
+
375
+ except Exception as e:
376
+ print(f"❌ TorchCodec loading failed: {e}")
377
+ raise e
378
 
379
+ def load_audio_librosa(audio_path: str, target_sr: int = 16000) -> tuple:
380
+ """Load audio using librosa (fallback method)"""
381
  try:
382
+ print(f"πŸ”§ Loading audio with librosa: {audio_path}")
383
+
384
+ # Load with librosa
385
+ waveform_np, sr = librosa.load(audio_path, sr=target_sr, mono=True)
386
+
387
+ # Convert to torch tensor and add channel dimension
388
+ waveform = torch.tensor(waveform_np).unsqueeze(0)
389
+
390
+ print(f"βœ… Librosa loaded audio: {waveform.shape} at {target_sr} Hz")
391
+ return waveform, target_sr
392
+
393
+ except Exception as e:
394
+ print(f"❌ Librosa loading failed: {e}")
395
+ raise e
396
 
397
+ def load_audio_torchaudio_legacy(audio_path: str, target_sr: int = 16000) -> tuple:
398
+ """Load audio using legacy TorchAudio (with backend handling)"""
399
+ try:
400
+ print(f"πŸ”§ Loading audio with TorchAudio (legacy): {audio_path}")
401
+
402
+ # Try different backends
403
+ backends_to_try = []
404
+
405
+ if TORCHAUDIO_AVAILABLE:
406
+ try:
407
+ # Suppress the deprecation warning temporarily
408
+ with warnings.catch_warnings():
409
+ warnings.simplefilter("ignore")
410
+ available_backends = torchaudio.list_audio_backends()
411
+ backends_to_try = available_backends
412
+ except Exception:
413
+ backends_to_try = ['soundfile', 'sox_io']
414
+
415
+ audio_format = detect_audio_format(audio_path)
416
+ print(f"🎡 Audio format: {audio_format}")
417
+ print(f"πŸ”§ Available backends: {backends_to_try}")
418
+
419
+ waveform = None
420
+ orig_sr = None
421
+
422
+ # Try to load with different backends
423
+ for backend in backends_to_try + [None]: # None for default
424
  try:
 
425
  if backend:
426
+ print(f"πŸ”„ Trying {backend} backend...")
427
+ if hasattr(torchaudio, 'set_audio_backend'):
428
+ torchaudio.set_audio_backend(backend)
429
  waveform, orig_sr = torchaudio.load(audio_path, backend=backend)
430
  else:
431
+ print(f"πŸ”„ Trying default backend...")
432
  waveform, orig_sr = torchaudio.load(audio_path)
433
+
434
+ print(f"βœ… Successfully loaded with {backend or 'default'} backend")
435
+ break
436
+
437
+ except Exception as e:
438
+ print(f"❌ {backend or 'default'} backend failed: {e}")
439
+ continue
440
+
441
+ if waveform is None:
442
+ raise Exception("All TorchAudio backends failed")
443
+
 
 
 
 
 
 
 
 
444
  print(f"🎡 Loaded audio: {waveform.shape} at {orig_sr} Hz")
445
+
446
+ # Convert to mono
447
+ if waveform.shape[0] > 1:
448
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
449
+ print(f"πŸ”„ Converted from stereo to mono")
450
+
451
+ # Resample if needed
452
  if orig_sr != target_sr:
453
  print(f"πŸ”„ Resampling from {orig_sr} Hz to {target_sr} Hz...")
454
  waveform = torchaudio.functional.resample(
 
457
  new_freq=target_sr
458
  )
459
  print(f"βœ… Resampled to {target_sr} Hz")
460
+
 
 
 
461
  return waveform, target_sr
462
+
463
+ except Exception as e:
464
+ print(f"❌ TorchAudio legacy loading failed: {e}")
465
+ raise e
466
 
467
+ def preprocess_audio(audio_path: str, target_sr: int = 16000) -> tuple:
468
+ """
469
+ Preprocess audio with multiple fallback methods for TorchAudio 2.8+ compatibility
470
+ """
471
+ try:
472
+ original_audio_format = detect_audio_format(audio_path)
473
+ print(f"🎡 Detected original format: {original_audio_format}")
474
+
475
+ # Method 1: Try TorchCodec (recommended for future)
476
+ if TORCHCODEC_AVAILABLE:
477
+ try:
478
+ return load_audio_torchcodec(audio_path, target_sr)
479
+ except Exception as e:
480
+ print(f"⚠️ TorchCodec failed: {e}")
481
+
482
+ # Method 2: Try TorchAudio legacy (with deprecation handling)
483
+ if TORCHAUDIO_AVAILABLE:
484
+ try:
485
+ return load_audio_torchaudio_legacy(audio_path, target_sr)
486
+ except Exception as e:
487
+ print(f"⚠️ TorchAudio legacy failed: {e}")
488
+
489
+ # Method 3: Fallback to librosa
490
+ try:
491
+ return load_audio_librosa(audio_path, target_sr)
492
+ except Exception as e:
493
+ print(f"⚠️ Librosa fallback failed: {e}")
494
+
495
+ raise Exception("All audio loading methods failed")
496
+
497
  except Exception as e:
498
  error_msg = f"❌ Error in audio preprocessing: {str(e)}"
499
  print(error_msg)
500
  raise Exception(error_msg)
501
 
 
 
 
 
 
502
  # --- GLOBAL MODEL STORAGE ---
 
503
  models = {}
504
  qa_system = {}
505
 
 
507
  """Load all models with caching using global variables."""
508
  global models
509
 
510
+ if models:
511
  print("βœ… Models already loaded from cache")
512
  return models
513
 
 
568
  models['punctuation_model'] = None
569
  models['punctuation_id2label'] = None
570
 
571
+ # Load IndicTrans2 model
572
  try:
573
  print("πŸ”„ Loading IndicTrans2 for translation...")
574
  model_name = "ai4bharat/indictrans2-indic-en-1B"
575
 
 
576
  models['indictrans_tokenizer'] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
577
  models['indictrans_model'] = AutoModelForSeq2SeqLM.from_pretrained(
578
  model_name,
 
580
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
581
  ).to(device)
582
 
583
+ # Use IndicTransToolkit if available, otherwise use basic processor
584
+ if INDICTRANS_TOOLKIT_AVAILABLE:
585
+ models['indic_processor'] = IndicProcessor(inference=True)
586
+ print("βœ… IndicTrans2 loaded with IndicTransToolkit")
587
+ else:
588
+ models['indic_processor'] = BasicIndicProcessor(inference=True)
589
+ print("βœ… IndicTrans2 loaded with basic processor")
590
+
591
  except Exception as e:
592
  print(f"❌ Error loading IndicTrans2 model: {e}")
 
593
  models['indictrans_tokenizer'] = None
594
  models['indictrans_model'] = None
595
  models['indic_processor'] = None
 
600
  """Load Q&A system with caching using global variables."""
601
  global qa_system
602
 
603
+ if qa_system:
604
  print("βœ… Q&A system already loaded from cache")
605
  return qa_system
606
 
607
  print("πŸš€ Loading Q&A system for the first time...")
608
 
609
  try:
 
610
  if os.path.exists("cleaned_qa_dataset.xlsx"):
611
  df = pd.read_excel("cleaned_qa_dataset.xlsx")
612
  qa_pairs = df[['Question', 'Answer']].dropna().drop_duplicates().reset_index(drop=True)
 
729
 
730
  def translate_with_indictrans2(text: str, source_lang: str = "hin_Deva") -> Dict:
731
  """
732
+ Translate Indic language text to English using IndicTrans2 model.
733
  """
734
  try:
735
  if not models.get('indictrans_model') or not models.get('indictrans_tokenizer') or not models.get('indic_processor'):
 
739
  "translated_text": ""
740
  }
741
 
742
+ print(f"πŸ”„ Translating with IndicTrans2: {source_lang} -> eng_Latn")
743
 
 
744
  input_sentences = [text.strip()]
745
 
746
  # Preprocess with IndicProcessor
 
777
  clean_up_tokenization_spaces=True,
778
  )
779
 
780
+ # Postprocess the translations
781
  translations = models['indic_processor'].postprocess_batch(generated_tokens, lang="eng_Latn")
782
 
783
  translated_text = translations[0] if translations else ""
 
806
  }
807
 
808
  try:
 
809
  user_question_embedding = qa_system['model'].encode(user_question, convert_to_tensor=True)
 
 
810
  similarities = util.cos_sim(user_question_embedding, qa_system['question_embeddings'])
 
 
811
  top_results = torch.topk(similarities, k=top_k)
812
 
813
  results = []
 
872
  print("πŸ‡ΊπŸ‡Έ Processing as English audio...")
873
  detected_lang_str = "Detected Language: English (Whisper Detection)"
874
 
 
875
  punctuated_transcription = add_punctuation(whisper_transcription)
876
  print(f"Original Whisper: {whisper_transcription}")
877
  print(f"With punctuation: {punctuated_transcription}")
878
 
 
879
  translation_result = punctuated_transcription
880
 
881
  return (
 
893
 
894
  print("πŸ” Using MMS-LID for detailed language identification...")
895
 
 
896
  inputs = models['lid_processor'](waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
897
  with torch.no_grad():
898
  outputs = models['lid_model'](**inputs)
 
913
  if not models.get('asr_model'):
914
  return "ASR model not available.", "", ""
915
 
 
916
  print(f"πŸ”€ Transcribing with IndicConformer ({detected_lang_name})...")
917
  with torch.no_grad():
918
  transcription = models['asr_model'](waveform_16k.to(device), asr_lang_code, "rnnt")
919
  print("βœ… IndicConformer transcription complete.")
920
 
 
921
  punctuated_transcription = add_punctuation(transcription.strip()) if transcription else ""
922
  print(f"Original: {transcription}")
923
  print(f"With punctuation: {punctuated_transcription}")
924
 
925
+ # Translation to English using IndicTrans2
926
  translation_result = ""
927
  translation_error = ""
928
 
929
  if punctuated_transcription:
930
  indictrans_lang_code = ASR_TO_INDICTRANS_MAP.get(asr_lang_code)
931
  if indictrans_lang_code:
932
+ print(f"πŸ”„ Translating {detected_lang_name} to English with IndicTrans2...")
933
  translation_response = translate_with_indictrans2(
934
  punctuated_transcription,
935
  indictrans_lang_code
 
937
 
938
  if translation_response["success"]:
939
  translation_result = translation_response["translated_text"]
940
+ print("βœ… IndicTrans2 translation complete.")
941
  else:
942
  translation_error = translation_response["error"]
943
  translation_result = "Translation failed"
 
948
  else:
949
  translation_result = "No text to translate"
950
 
 
951
  if translation_error:
952
  translation_display = f"❌ {translation_result}\nError: {translation_error}"
953
  else:
 
966
  """Process audio and perform semantic search."""
967
  print(f"--- Processing audio file with Whisper-first pipeline: {audio_path} ---")
968
 
 
969
  detected_language, transcription, translated_text = transcribe_audio_with_lid(audio_path)
970
 
 
971
  if "Error" in detected_language:
972
  return {
973
  "status": "audio_processing_failed",
 
993
  if not audio_path:
994
  return "No audio file provided", "", "", "Please upload an audio file."
995
 
 
996
  integrated_result = process_audio_and_search(audio_path)
997
 
 
998
  detected_language_output = ""
999
  transcription_output = ""
1000
  translated_text_output = ""
1001
  semantic_search_output_string = ""
1002
 
 
1003
  if integrated_result["status"] == "success":
 
1004
  audio_processing = integrated_result["audio_processing"]
1005
  detected_language_output = audio_processing["detected_language"]
1006
  transcription_output = audio_processing["transcription"]
1007
  translated_text_output = audio_processing["translated_text"]
1008
 
 
1009
  semantic_search = integrated_result["semantic_search"]
1010
 
 
1011
  if semantic_search["status"] == "success":
1012
  semantic_search_output_string = "--- Top 3 Semantic Search Results ---\n\n"
1013
  for result in semantic_search["results"]:
 
1026
  )
1027
 
1028
  else:
 
1029
  error_message = integrated_result.get("error", "An unknown error occurred during audio processing.")
1030
  detected_language_output = f"Error: {error_message}"
1031
  transcription_output = "N/A"
1032
  translated_text_output = "N/A"
1033
  semantic_search_output_string = "Semantic search could not be performed due to audio processing error."
1034
 
 
1035
  return (detected_language_output, transcription_output, translated_text_output, semantic_search_output_string)
1036
 
1037
  def create_gradio_app():
1038
  """Create the Gradio interface."""
1039
 
 
1040
  audio_input = gr.Audio(type="filepath", label="Upload Audio File")
 
 
1041
  detected_language_output = gr.Textbox(label="Detected Language")
1042
  transcription_output = gr.Textbox(label="Transcription")
1043
  translated_text_output = gr.Textbox(label="Translated Text")
1044
  semantic_search_output = gr.Textbox(label="Semantic Search Results")
1045
 
1046
+ audio_backend_info = ""
1047
+ if TORCHCODEC_AVAILABLE:
1048
+ audio_backend_info = "🎡 **Audio Backend**: TorchCodec (recommended)"
1049
+ elif TORCHAUDIO_AVAILABLE:
1050
+ audio_backend_info = "🎡 **Audio Backend**: TorchAudio (legacy with deprecation handling)"
1051
+ else:
1052
+ audio_backend_info = "🎡 **Audio Backend**: Librosa (fallback)"
1053
+
1054
  iface = gr.Interface(
1055
  fn=gradio_interface_fn,
1056
  inputs=audio_input,
1057
  outputs=[detected_language_output, transcription_output, translated_text_output, semantic_search_output],
1058
  title="🌾 Multilingual Agricultural Voice Assistant",
1059
+ description=f"""
1060
  Upload an audio file in English or any of the 22+ supported Indic languages.
1061
  The system will:
1062
  1. 🎧 Detect the language automatically
1063
  2. πŸ“ Transcribe the speech with punctuation
1064
+ 3. 🌍 Translate to English using **IndicTrans2**
1065
  4. πŸ” Find relevant agricultural answers from the knowledge base
1066
 
1067
  **Supported Languages:** English, Hindi, Bengali, Telugu, Tamil, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Odia, Assamese, Urdu, Nepali, Sanskrit, and more!
1068
 
1069
+ {audio_backend_info}
1070
+ **πŸ”§ Translation**: IndicTrans2 with robust preprocessing
1071
+ **⚠️ Note**: Updated for TorchAudio 2.8+ deprecations
1072
  """,
1073
  examples=[],
1074
  theme=gr.themes.Soft(),
 
1082
  print("\n" + "="*60)
1083
  print("🌾 MULTILINGUAL AGRICULTURAL VOICE ASSISTANT")
1084
  print("="*60)
1085
+
1086
+ if TORCHCODEC_AVAILABLE:
1087
+ print("🎡 Audio Backend: TorchCodec (recommended)")
1088
+ elif TORCHAUDIO_AVAILABLE:
1089
+ print("🎡 Audio Backend: TorchAudio (legacy with deprecation handling)")
1090
+ else:
1091
+ print("🎡 Audio Backend: Librosa (fallback)")
1092
+
1093
+ print("πŸ”§ Translation: IndicTrans2 Model")
1094
+ print("⚠️ Updated for TorchAudio 2.8+ deprecations")
1095
  print("🎯 Features available:")
1096
  print(" β€’ Multi-format audio processing (15+ formats)")
1097
  print(" β€’ Whisper-based English detection and transcription")
1098
  print(" β€’ MMS-LID for 22+ Indic language detection")
1099
  print(" β€’ IndicConformer for Indic language ASR")
1100
  print(" β€’ Bidirectional Gemma3 punctuation (31 punctuation types)")
1101
+ print(" β€’ IndicTrans2 for professional translation")
1102
  print(" β€’ Semantic Q&A search")
1103
  print("="*60)
1104
 
 
1105
  print("πŸš€ Loading models...")
1106
  models = load_models()
1107
  qa_system = load_qa_system()
1108
 
 
1109
  print("πŸŽͺ Launching Gradio interface...")
1110
  app = create_gradio_app()
1111
  app.launch()