Noumida committed on
Commit 5da4a63 · verified · Parent(s): 366a764

Update app.py

Files changed (1): app.py (+84, −540)
app.py CHANGED
@@ -7,6 +7,8 @@ import requests
import json
import os
import re
+ import tempfile
+ import shutil
from typing import List, Dict, Optional, Union
from transformers import AutoModel, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -35,7 +37,7 @@ asr_model.eval()
print("✅ ASR Model loaded.")

print("\nLoading Whisper model for English...")
- model_name = "openai/whisper-large"
+ model_name = "openai/whisper-small"
whisper_processor = WhisperProcessor.from_pretrained(model_name)
whisper_model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
print("✅ Whisper Model loaded.")
@@ -47,9 +49,8 @@ lid_model = AutoModelForAudioClassification.from_pretrained(lid_model_id).to(dev
lid_model.eval()
print("✅ Language ID Model loaded.")

- # [Keep all your existing mappings and functions - they remain the same]
+ # [Keep all your existing mappings]
LID_TO_ASR_LANG_MAP = {
-     # MMS-style codes (e.g., hin_Deva)
    "asm_Beng": "as", "ben_Beng": "bn", "brx_Deva": "br", "doi_Deva": "doi",
    "guj_Gujr": "gu", "hin_Deva": "hi", "kan_Knda": "kn", "kas_Arab": "ks",
    "kas_Deva": "ks", "gom_Deva": "kok", "mai_Deva": "mai", "mal_Mlym": "ml",
@@ -91,7 +92,42 @@ LANGUAGE_OPTIONS = {
    "Santali": "sat_Olck"
}

- # [Keep all your existing functions - transcribe_with_whisper, recommend_questions_from_text, etc.]
+ # --- NEW: TEMPORARY STORAGE FUNCTIONS ---
+ def create_temp_audio_file(audio_path: str) -> str:
+     """
+     Create a temporary copy of the audio file for processing.
+     Returns the path to the temporary file.
+     """
+     if not audio_path:
+         return None
+
+     try:
+         # Create a unique temporary file
+         temp_fd, temp_path = tempfile.mkstemp(suffix=".wav", prefix="audio_temp_")
+         os.close(temp_fd)  # Close the file descriptor
+
+         # Copy the original audio to the temp location
+         shutil.copy2(audio_path, temp_path)
+
+         print(f"📁 Audio temporarily stored at: {temp_path}")
+         return temp_path
+
+     except Exception as e:
+         print(f"❌ Error creating temporary audio file: {str(e)}")
+         return audio_path  # Fall back to the original path
+
+ def cleanup_temp_file(temp_path: str):
+     """
+     Clean up the temporary audio file after processing.
+     """
+     try:
+         if temp_path and os.path.exists(temp_path) and "temp" in temp_path:
+             os.unlink(temp_path)
+             print(f"🗑️ Cleaned up temporary file: {temp_path}")
+     except Exception as e:
+         print(f"⚠️ Warning: Could not clean up temp file: {str(e)}")
+
+ # [Keep your existing functions exactly as they are]
def transcribe_with_whisper(audio_path):
    """Transcribe English audio using Whisper."""
    try:
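The two helpers added above pair a copy (`create_temp_audio_file`) with a later `cleanup_temp_file` call. The same guarantee can be written as a context manager, which keeps the unlink next to the mkstemp; a minimal sketch under the same `tempfile`/`shutil` imports (`temp_audio_copy` is hypothetical, not part of this commit):

```python
import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def temp_audio_copy(audio_path: str):
    """Yield a temporary copy of audio_path, deleting it on exit."""
    fd, temp_path = tempfile.mkstemp(suffix=".wav", prefix="audio_temp_")
    os.close(fd)
    shutil.copy2(audio_path, temp_path)
    try:
        yield temp_path
    finally:
        # Remove the copy even if processing raised.
        with contextlib.suppress(OSError):
            os.unlink(temp_path)

# Usage sketch:
# with temp_audio_copy("recording.wav") as path:
#     result = transcribe_audio_with_lid(path)
```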
@@ -134,7 +170,7 @@ Please provide exactly 5 questions, formatted as:

Make the questions thoughtful and educational."""

-         # Initialize Gemini model
+         # Initialize Gemini model - KEEPING YOUR ORIGINAL MODEL
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Generate content with Gemini
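For context, the comment edited above sits just before the Gemini call, which caps the output budget; the surrounding lines, unchanged by this commit, read as follows in the previous version of the file:

```python
# Generate content with Gemini
response = model.generate_content(
    prompt,
    generation_config=genai.types.GenerationConfig(
        max_output_tokens=400,
        temperature=0.7,
    ),
)
questions_text = response.text.strip()
```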
@@ -313,7 +349,7 @@ def transcribe_audio_with_lid(audio_path):
        detected_lang_name = ASR_CODE_TO_NAME.get(asr_lang_code, 'Unknown')
        detected_lang_str = f"Detected Language: {detected_lang_name} ({detected_lid_code})"

-         # UPDATED: Use Whisper Transformers for English, IndicConformer for others
+         # Use Whisper Transformers for English, IndicConformer for others
        if asr_lang_code == "en":
            # Use Whisper Transformers for English audio
            transcription_rnnt = transcribe_with_whisper(audio_path)
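The `else` branch of this routing, untouched by the commit, sends non-English audio to IndicConformer's RNNT decoder; from the previous version of the file, the full branch reads:

```python
if asr_lang_code == "en":
    # Whisper handles English audio
    transcription_rnnt = transcribe_with_whisper(audio_path)
else:
    # IndicConformer for Indic languages - RNNT decoding only
    with torch.no_grad():
        transcription_rnnt = asr_model(waveform_16k.to(device), asr_lang_code, "rnnt")
```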
@@ -377,11 +413,42 @@ def transcribe_audio_with_lid(audio_path):
        questions_result
    )

- # --- FIXED: Gradio UI with Proper Audio Recording ---
+ # --- NEW: AUDIO PROCESSING WITH TEMPORARY STORAGE ---
+ @spaces.GPU
+ def process_audio_with_temp_storage(audio_path):
+     """
+     Process audio with temporary storage for better handling of recorded audio.
+     """
+     if not audio_path:
+         return "Please provide an audio file.", "", "", ""
+
+     # Create a temporary copy of the audio file
+     temp_audio_path = create_temp_audio_file(audio_path)
+
+     try:
+         print(f"🎵 Processing audio file: {os.path.basename(temp_audio_path)}")
+
+         # Process the temporarily stored audio
+         result = transcribe_audio_with_lid(temp_audio_path)
+
+         print("✅ Audio processing completed successfully")
+         return result
+
+     except Exception as e:
+         print(f"❌ Error during audio processing: {str(e)}")
+         return f"Error processing audio: {str(e)}", "", "", ""
+
+     finally:
+         # Clean up the temporary file
+         cleanup_temp_file(temp_audio_path)
+
+ # --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="Indic STT + Translation + Question Recommendations") as demo:
    gr.Markdown(f"## {DESCRIPTION}")
    gr.Markdown("""
-     Upload/record audio OR input text in English or any of the 22 supported Indian languages
+     🎤 **Upload/record audio** OR **input text** in English or any of the 22 supported Indian languages
+
+     *Audio files are temporarily stored during processing and automatically cleaned up afterwards.*
    """)

    with gr.Row():
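Worth noting how the two failure modes interact: `create_temp_audio_file` falls back to returning the original path if the copy fails, and `cleanup_temp_file` only unlinks paths containing `"temp"`, so the `finally` block above never deletes a user's original file. A sketch with hypothetical paths:

```python
temp_path = create_temp_audio_file("/data/recording.wav")
# Success: temp_path is e.g. "/tmp/audio_temp_ab12cd.wav" and is unlinked below.
# Copy failure: temp_path == "/data/recording.wav"; cleanup_temp_file skips it
# because that path does not contain the substring "temp".
cleanup_temp_file(temp_path)
```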
@@ -393,7 +460,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Indic STT + Translation + Question
            label="Choose Input Method"
        )

-         # FIXED: Audio input with proper recording configuration
+         # Audio input with proper recording configuration
        audio = gr.Audio(
            label="Upload or Record Audio",
            sources=["upload", "microphone"],  # Enable both upload and microphone
@@ -420,7 +487,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Indic STT + Translation + Question
        )

        process_btn = gr.Button(
-             " Process & Get Question Recommendations",
+             "🚀 Process & Get Question Recommendations",
            variant="primary",
            scale=2
        )
@@ -428,12 +495,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Indic STT + Translation + Question
    with gr.Column(scale=2):
        # Detection/Processing Result
        detection_output = gr.Label(
-             label=" Processing Result",
+             label="🔍 Processing Result",
            show_label=True
        )

        # Input/Transcription Results
-         with gr.Tab(" Input/Transcription"):
+         with gr.Tab("📝 Input/Transcription"):
            gr.Markdown("### Original Text")
            input_output = gr.Textbox(
                lines=4,
@@ -442,7 +509,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Indic STT + Translation + Question
            )

        # Translation Results
-         with gr.Tab(" Translation"):
+         with gr.Tab("🌍 Translation"):
            translation_output = gr.Textbox(
                lines=4,
                label="English Translation",
@@ -450,7 +517,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Indic STT + Translation + Question
            )

        # Question Recommendations
-         with gr.Tab("Question Recommendations"):
+         with gr.Tab("Question Recommendations"):
            questions_output = gr.Textbox(
                lines=8,
                label="Recommended Questions",
@@ -470,11 +537,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Indic STT + Translation + Question
        outputs=[audio, text_input, language_dropdown]
    )

-     # Main processing function that handles both audio and text
+     # UPDATED: Main processing function with temporary storage
    def process_input(method, audio_file, text, language):
        if method == "Audio Input":
            if audio_file:
-                 return transcribe_audio_with_lid(audio_file)
+                 # Use the new function with temporary storage
+                 return process_audio_with_temp_storage(audio_file)
            else:
                return "Please upload or record an audio file.", "", "", ""
        else:  # Text Input
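For reference, `process_input` is wired to the button as shown below; this wiring is unchanged by the commit and is reproduced from the previous version of the file:

```python
process_btn.click(
    fn=process_input,
    inputs=[input_method, audio, text_input, language_dropdown],
    outputs=[
        detection_output,
        input_output,
        translation_output,
        questions_output
    ],
    api_name="process"
)
```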
@@ -502,527 +570,3 @@ if __name__ == "__main__":
        server_port=7860,
        share=True
    )
[... 524 deleted lines not shown: trailing blank lines plus the previous version of app.py, which had been left at the bottom of the file as one large commented-out block — old imports, the ngrok API configuration, a hard-coded Google API key, model loading, the language mappings, the Whisper/Gemini/translation helpers, and the old Gradio UI. This commit removes that dead code; notably, the commented-out version already used "openai/whisper-small" (now adopted above) and 'gemini-2.5-flash' (the live code keeps 'gemini-1.5-flash'). ...]
 