Spaces:

sematech
/

sema-api

Sleeping

App Files Files Community

kamau1 committed on Jun 21

Commit

06eadfa

1 Parent(s): a503604

feat(update): added a language detection endpoint that would be very good for chatbot implementations

Browse files

Files changed (5) hide show

app/api/v1/endpoints.py +145 -1
app/models/schemas.py +118 -0
app/services/translation.py +33 -0
curl_commands.md → tests/curl_commands.md +39 -1
tests/simple_test.py +32 -0

app/api/v1/endpoints.py CHANGED Viewed

@@ -15,11 +15,14 @@ from ...models.schemas import (
     HealthResponse,
     LanguagesResponse,
     LanguageStatsResponse,
-    LanguageInfo
 )
 from ...services.translation import (
     translate_with_detection,
     translate_with_source,
     models_loaded
 )
 from ...services.languages import (
@@ -324,6 +327,147 @@ async def translate_endpoint(
         )
 @router.get(
     "/languages",
     response_model=LanguagesResponse,

     HealthResponse,
     LanguagesResponse,
     LanguageStatsResponse,
+    LanguageInfo,
+    LanguageDetectionRequest,
+    LanguageDetectionResponse
 )
 from ...services.translation import (
     translate_with_detection,
     translate_with_source,
+    detect_language,
     models_loaded
 )
 from ...services.languages import (
         )
+@router.post(
+    "/detect-language",
+    response_model=LanguageDetectionResponse,
+    tags=["Language Detection"],
+    summary="Detect Input Language",
+    description="Detect the language of input text for multilingual applications.",
+    responses={
+        200: {"description": "Language detected successfully"},
+        400: {"description": "Invalid request - empty or whitespace-only text"},
+        413: {"description": "Text too long"},
+        429: {"description": "Rate limit exceeded"},
+        500: {"description": "Language detection service error"}
+    }
+)
+@limiter.limit(f"{settings.max_requests_per_minute}/minute")
+async def detect_language_endpoint(
+    detection_request: LanguageDetectionRequest,
+    request: Request
+):
+    """
+    ## 🔍 Detect Input Language
+    Detect the language of input text - perfect for multilingual chatbots and applications.
+    ### 🎯 Use Cases
+    - **Multilingual Chatbots**: Detect user language before processing
+    - **Content Routing**: Route content based on detected language
+    - **Auto-Translation**: Decide whether translation is needed
+    - **Language Analytics**: Track language usage patterns
+    ### 🤖 Chatbot Implementation Example
+    ```python
+    # 1. Detect user input language
+    detection = await detect_language(user_input)
+    # 2. Decide processing flow
+    if detection.is_english:
+        # Process directly in English
+        response = await llm_chat(user_input)
+    else:
+        # Translate to English, process, translate back
+        english_input = await translate(user_input, "eng_Latn")
+        english_response = await llm_chat(english_input)
+        response = await translate(english_response, detection.detected_language)
+    ```
+    ### ✨ Features
+    - **High Accuracy**: FastText-based language detection
+    - **200+ Languages**: Supports all FLORES-200 languages
+    - **Confidence Scores**: Get detection confidence (0.0-1.0)
+    - **English Flag**: Quick check if input is English
+    - **Fast Processing**: ~0.01-0.05 seconds detection time
+    ### 📊 Response Information
+    - **Language Code**: FLORES-200 format (e.g., swh_Latn)
+    - **Language Names**: Both English and native names
+    - **Confidence Score**: Detection accuracy (higher = more confident)
+    - **English Flag**: Boolean for quick English detection
+    - **Character Count**: Input text length for analytics
+    ### 🔒 Limits
+    - **Rate Limit**: `settings.max_requests_per_minute` requests per minute
+    - **Text Length**: Maximum 1000 characters
+    - **Minimum Length**: At least 1 non-whitespace character required
+    ### ⚙️ Implementation Notes
+    Args:
+        detection_request: Validated request body carrying the text to inspect.
+        request: Incoming request; ``request.state.request_id`` is echoed in
+            logs and in the response.
+    Returns:
+        LanguageDetectionResponse with the detected code, display names,
+        confidence, English flag, character count, timestamp and request id.
+    Raises:
+        HTTPException: 413 when the text exceeds 1000 characters, 400 when it
+            contains only whitespace, 500 when detection itself fails.
+    """
+    request_id = request.state.request_id
+    text = detection_request.text
+    character_count = len(text)
+    # Validate text length
+    if character_count > 1000:
+        raise HTTPException(
+            status_code=413,
+            detail="Text too long. Maximum 1000 characters allowed for language detection."
+        )
+    # Whitespace-only input satisfies min_length=1 but gives the model nothing
+    # to classify, so reject it as a client error instead of a server error.
+    if not text.strip():
+        raise HTTPException(
+            status_code=400,
+            detail="Text must contain at least one non-whitespace character."
+        )
+    full_date, _ = get_nairobi_time()
+    # Log detection request
+    logger.info(
+        "language_detection_started",
+        request_id=request_id,
+        character_count=character_count
+    )
+    try:
+        # Detect language
+        detected_lang_code, confidence = detect_language(text)
+        # Get language information; fall back to the raw code when the
+        # language is not in our database
+        language_info = get_language_info(detected_lang_code)
+        if language_info:
+            language_name = language_info["name"]
+            native_name = language_info["native_name"]
+        else:
+            language_name = detected_lang_code
+            native_name = detected_lang_code
+        # Check if detected language is English
+        is_english = detected_lang_code in ("eng_Latn", "eng_Arab")
+        # Log successful detection
+        logger.info(
+            "language_detection_completed",
+            request_id=request_id,
+            detected_language=detected_lang_code,
+            confidence=confidence,
+            is_english=is_english,
+            character_count=character_count
+        )
+        return LanguageDetectionResponse(
+            detected_language=detected_lang_code,
+            language_name=language_name,
+            native_name=native_name,
+            confidence=confidence,
+            is_english=is_english,
+            character_count=character_count,
+            timestamp=full_date,
+            request_id=request_id
+        )
+    except Exception as e:
+        # Log detection error
+        logger.error(
+            "language_detection_failed",
+            request_id=request_id,
+            error=str(e),
+            error_type=type(e).__name__,
+            character_count=character_count
+        )
+        # Update error metrics
+        ERROR_COUNT.labels(error_type="language_detection_error").inc()
+        # Chain the original exception so the root cause stays in tracebacks
+        raise HTTPException(
+            status_code=500,
+            detail="Language detection service temporarily unavailable. Please try again later."
+        ) from e
 @router.get(
     "/languages",
     response_model=LanguagesResponse,

app/models/schemas.py CHANGED Viewed

@@ -151,6 +151,124 @@ class HealthResponse(BaseModel):
     timestamp: str = Field(..., description="Current timestamp")
 class ErrorResponse(BaseModel):
     """Response model for error responses"""

     timestamp: str = Field(..., description="Current timestamp")
+class LanguageDetectionRequest(BaseModel):
+    """
+    Language detection request model
+    For detecting the language of input text.
+    """
+    # The only request field: the raw text whose language should be detected.
+    text: str = Field(
+        ...,
+        example="Habari ya asubuhi",
+        description="Text to detect language for (1-1000 characters)",
+        min_length=1,
+        max_length=1000,
+        title="Input Text"
+    )
+    class Config:
+        # JSON Schema "examples" is a list of example *payloads*. The
+        # summary/description/value wrapper is an OpenAPI Example Object and
+        # is not valid here, so each entry is a plain request body.
+        json_schema_extra = {
+            "examples": [
+                # Swahili text detection: Swahili greeting
+                {
+                    "text": "Habari ya asubuhi"
+                },
+                # English text detection
+                {
+                    "text": "Good morning, how are you?"
+                },
+                # French text detection
+                {
+                    "text": "Bonjour, comment allez-vous?"
+                }
+            ]
+        }
+class LanguageDetectionResponse(BaseModel):
+    """
+    Language detection response model
+    Contains detected language information and confidence.
+    """
+    # Language code reported by the detector.
+    detected_language: str = Field(
+        ...,
+        description="Detected language code in FLORES-200 format",
+        example="swh_Latn",
+        title="Detected Language Code"
+    )
+    # Display name of the detected language.
+    language_name: str = Field(
+        ...,
+        description="Human-readable name of detected language",
+        example="Swahili",
+        title="Language Name"
+    )
+    # Name of the language as written in that language.
+    native_name: str = Field(
+        ...,
+        description="Native name of detected language",
+        example="Kiswahili",
+        title="Native Language Name"
+    )
+    # Validated to lie within [0.0, 1.0]; out-of-range values are rejected.
+    confidence: float = Field(
+        ...,
+        description="Detection confidence score (0.0 to 1.0)",
+        example=0.9876,
+        ge=0.0,
+        le=1.0,
+        title="Confidence Score"
+    )
+    is_english: bool = Field(
+        ...,
+        description="Whether the detected language is English",
+        example=False,
+        title="Is English"
+    )
+    # Must be at least 1, mirroring the request's min_length.
+    character_count: int = Field(
+        ...,
+        description="Number of characters in input text",
+        example=17,
+        ge=1,
+        title="Character Count"
+    )
+    # Example weekday matches the example date (2024-06-21 was a Friday).
+    timestamp: str = Field(
+        ...,
+        description="Detection timestamp in Nairobi timezone",
+        example="Friday | 2024-06-21 | 14:30:25",
+        title="Timestamp"
+    )
+    request_id: str = Field(
+        ...,
+        description="Unique request identifier for debugging",
+        example="550e8400-e29b-41d4-a716-446655440000",
+        title="Request ID"
+    )
+    class Config:
+        # Whole-model example shown in the generated schema.
+        json_schema_extra = {
+            "example": {
+                "detected_language": "swh_Latn",
+                "language_name": "Swahili",
+                "native_name": "Kiswahili",
+                "confidence": 0.9876,
+                "is_english": False,
+                "character_count": 17,
+                "timestamp": "Friday | 2024-06-21 | 14:30:25",
+                "request_id": "550e8400-e29b-41d4-a716-446655440000"
+            }
+        }
 class ErrorResponse(BaseModel):
     """Response model for error responses"""

app/services/translation.py CHANGED Viewed

@@ -220,6 +220,39 @@ def translate_with_source(text: str, source_lang: str, target_lang: str) -> Tupl
         raise e
 def models_loaded() -> bool:
     """Check if all models are loaded"""
     return all([lang_model, sp_model, translator])

         raise e
+def detect_language(text: str) -> Tuple[str, float]:
+    """
+    Detect the language of input text
+    Newlines are replaced with spaces and surrounding whitespace is stripped
+    before the text is passed to ``lang_model.predict``.
+    Args:
+        text: Raw input text.
+    Returns:
+        Tuple of (language_code, confidence_score). The code is the top
+        predicted label with the ``__label__`` prefix removed; the score is
+        clamped to the range 0.0-1.0.
+    Raises:
+        RuntimeError: If the language detection model is not loaded.
+        ValueError: If the text is empty after cleaning.
+        Exception: Any other failure from the model is logged and re-raised
+            to be handled by the endpoint.
+    """
+    try:
+        if lang_model is None:
+            raise RuntimeError("Language detection model is not loaded")
+        # Clean text for better detection
+        cleaned_text = text.replace('\n', ' ').strip()
+        if not cleaned_text:
+            # Nothing to classify; fail explicitly rather than indexing into
+            # whatever the model returns for empty input.
+            raise ValueError("Cannot detect language of empty text")
+        # Get predictions with confidence scores
+        labels, scores = lang_model.predict(cleaned_text, k=1)
+        # Extract language code and confidence
+        language_code = labels[0].replace('__label__', '')
+        # Float rounding in the model can push the score marginally outside
+        # [0, 1]; clamp it so response validation (ge=0.0, le=1.0) cannot fail.
+        confidence = min(max(float(scores[0]), 0.0), 1.0)
+        logger.info(
+            "language_detected",
+            text_length=len(text),
+            detected_language=language_code,
+            confidence=confidence
+        )
+        return language_code, confidence
+    except Exception as e:
+        logger.error("language_detection_failed", error=str(e), error_type=type(e).__name__)
+        # Re-raise the exception to be handled by the endpoint
+        raise
 def models_loaded() -> bool:
     """Check if all models are loaded"""
     return all([lang_model, sp_model, translator])

curl_commands.md → tests/curl_commands.md RENAMED Viewed

@@ -60,7 +60,45 @@ curl -X POST "$API_URL/api/v1/translate" \
   }'
 ```
-## 🗣️ Language Information Endpoints
 ### Get All Supported Languages
 ```bash

   }'
 ```
+## 🔍 Language Detection Endpoints
+### Detect Language of Text
+```bash
+curl -X POST "$API_URL/detect-language" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "text": "Habari ya asubuhi"
+  }'
+```
+### Detect Language (English Text)
+```bash
+curl -X POST "$API_URL/detect-language" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "text": "Good morning, how are you today?"
+  }'
+```
+### Detect Language (French Text)
+```bash
+curl -X POST "$API_URL/detect-language" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "text": "Bonjour, comment allez-vous?"
+  }'
+```
+### Detect Language (Versioned Endpoint)
+```bash
+curl -X POST "$API_URL/api/v1/detect-language" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "text": "Hola, ¿cómo estás?"
+  }'
+```
+## 🗣️ Language Information Endpoints
 ### Get All Supported Languages
 ```bash

tests/simple_test.py CHANGED Viewed

@@ -108,6 +108,37 @@ def test_search():
     print("-" * 50)
 def run_all_tests():
     """Run all tests"""
     print(f"🧪 Testing API at: {API_URL}")
@@ -117,6 +148,7 @@ def run_all_tests():
     test_translation()
     test_languages()
     test_search()
     print("🎉 All tests completed!")

     print("-" * 50)
+def test_language_detection():
+    """
+    Test language detection endpoint
+    Posts each sample text to ``{API_URL}/detect-language`` and prints the
+    detected language, confidence and English flag. A detection that differs
+    from ``expected_lang`` is printed with a warning marker; transport errors
+    and non-200 responses are reported and the next case is tried.
+    """
+    print("🔍 Testing language detection...")
+    test_cases = [
+        {"text": "Habari ya asubuhi", "expected_lang": "swh_Latn"},
+        {"text": "Good morning", "expected_lang": "eng_Latn"},
+        {"text": "Bonjour", "expected_lang": "fra_Latn"},
+        {"text": "Hola mundo", "expected_lang": "spa_Latn"}
+    ]
+    for test_case in test_cases:
+        text = test_case["text"]
+        expected = test_case["expected_lang"]
+        try:
+            # A timeout keeps the script from hanging on an unresponsive API.
+            response = requests.post(
+                f"{API_URL}/detect-language",
+                headers={"Content-Type": "application/json"},
+                json={"text": text},
+                timeout=60
+            )
+        except requests.RequestException as exc:
+            print(f"❌ Request failed for '{text}': {exc}")
+            continue
+        if response.status_code != 200:
+            print(f"❌ Detection failed for '{text}' (HTTP {response.status_code})")
+            continue
+        data = response.json()
+        detected = data['detected_language']
+        confidence = data['confidence']
+        is_english = data['is_english']
+        if detected == expected:
+            print(f"✅ '{text}' → {detected} ({data['language_name']})")
+        else:
+            # Actually check the expectation each test case declares.
+            print(f"⚠️ '{text}' → {detected} ({data['language_name']}), expected {expected}")
+        print(f"   Confidence: {confidence:.3f}, Is English: {is_english}")
+    print("-" * 50)
 def run_all_tests():
     """Run all tests"""
     print(f"🧪 Testing API at: {API_URL}")
     test_translation()
     test_languages()
     test_search()
+    test_language_detection()
     print("🎉 All tests completed!")