import re
import string
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List

import torch
from loguru import logger

try:
    from langdetect import detect_langs, DetectorFactory

    # Fix the seed so langdetect's probabilistic results are reproducible
    DetectorFactory.seed = 0
    LANGDETECT_AVAILABLE = True

except ImportError:
    logger.warning("langdetect not available. Install: pip install langdetect")
    LANGDETECT_AVAILABLE = False

try:
    from models.model_manager import get_model_manager
    MODEL_MANAGER_AVAILABLE = True

except ImportError:
    logger.warning("model_manager not available, using fallback methods")
    MODEL_MANAGER_AVAILABLE = False

class Language(Enum):
    """ISO 639-1 language codes for supported languages"""

    ENGLISH = "en"
    SPANISH = "es"
    FRENCH = "fr"
    GERMAN = "de"
    ITALIAN = "it"
    PORTUGUESE = "pt"
    RUSSIAN = "ru"
    CHINESE = "zh"
    JAPANESE = "ja"
    KOREAN = "ko"
    ARABIC = "ar"
    HINDI = "hi"
    DUTCH = "nl"
    POLISH = "pl"
    TURKISH = "tr"
    SWEDISH = "sv"
    VIETNAMESE = "vi"
    INDONESIAN = "id"
    THAI = "th"
    GREEK = "el"
    HEBREW = "he"
    CZECH = "cs"
    ROMANIAN = "ro"
    DANISH = "da"
    FINNISH = "fi"
    NORWEGIAN = "no"
    UNKNOWN = "unknown"

class Script(Enum):
    """Writing scripts"""

    LATIN = "latin"
    CYRILLIC = "cyrillic"
    ARABIC = "arabic"
    CHINESE = "chinese"
    JAPANESE = "japanese"
    KOREAN = "korean"
    DEVANAGARI = "devanagari"
    GREEK = "greek"
    HEBREW = "hebrew"
    THAI = "thai"
    MIXED = "mixed"
    UNKNOWN = "unknown"

@dataclass
class LanguageDetectionResult:
    """Result of language detection"""

    primary_language: Language
    confidence: float
    all_languages: Dict[str, float]
    script: Script
    is_multilingual: bool
    detection_method: str
    char_count: int
    word_count: int
    warnings: List[str]

    def to_dict(self) -> Dict:
        """Convert to dictionary"""
        return {
            "primary_language": self.primary_language.value,
            "confidence": round(self.confidence, 4),
            "all_languages": {k: round(v, 4) for k, v in self.all_languages.items()},
            "script": self.script.value,
            "is_multilingual": self.is_multilingual,
            "detection_method": self.detection_method,
            "char_count": self.char_count,
            "word_count": self.word_count,
            "warnings": self.warnings,
        }

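# Example of the serialized shape produced by to_dict() (illustrative values):
#   {"primary_language": "en", "confidence": 0.9734,
#    "all_languages": {"en": 0.9734}, "script": "latin",
#    "is_multilingual": False, "detection_method": "langdetect",
#    "char_count": 120, "word_count": 22, "warnings": []}
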
class LanguageDetector:
    """
    Detects the language of input text using multiple strategies with fallbacks.

    Features:
    - Primary : XLM-RoBERTa model (supports 100+ languages)
    - Fallback 1 : langdetect library (fast, probabilistic)
    - Fallback 2 : character-based heuristics
    - Confidence scoring
    - Multi-language detection
    - Script detection (Latin, Cyrillic, Arabic, etc.)

    Supported Languages:
    - 100+ languages via XLM-RoBERTa
    - High accuracy for major languages (English, Spanish, French, German, Chinese, etc.)
    """

    MIN_TEXT_LENGTH = 20

    LANGUAGE_NAMES = {
        "en": "English",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian",
        "pt": "Portuguese",
        "ru": "Russian",
        "zh": "Chinese",
        "ja": "Japanese",
        "ko": "Korean",
        "ar": "Arabic",
        "hi": "Hindi",
    }

    # Unicode code-point ranges per script, used by _detect_script
    SCRIPT_RANGES = {
        Script.LATIN: [(0x0041, 0x007A), (0x00C0, 0x024F)],
        Script.CYRILLIC: [(0x0400, 0x04FF)],
        Script.ARABIC: [(0x0600, 0x06FF), (0x0750, 0x077F)],
        Script.CHINESE: [(0x4E00, 0x9FFF), (0x3400, 0x4DBF)],
        Script.JAPANESE: [(0x3040, 0x309F), (0x30A0, 0x30FF)],
        Script.KOREAN: [(0xAC00, 0xD7AF), (0x1100, 0x11FF)],
        Script.DEVANAGARI: [(0x0900, 0x097F)],
        Script.GREEK: [(0x0370, 0x03FF)],
        Script.HEBREW: [(0x0590, 0x05FF)],
        Script.THAI: [(0x0E00, 0x0E7F)],
    }

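    # A quick worked example of how SCRIPT_RANGES is applied (see _detect_script):
    # ord('П') == 0x041F falls inside (0x0400, 0x04FF), so the character counts
    # toward Script.CYRILLIC, while ord('a') == 0x0061 falls inside
    # (0x0041, 0x007A) and counts toward Script.LATIN. Text mixing both scripts
    # with neither reaching 70% of counted characters is reported as Script.MIXED.
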
    def __init__(self, use_model: bool = True, min_confidence: float = 0.5):
        """
        Initialize language detector

        Arguments:
        ----------
        use_model : Use ML model for detection (more accurate)

        min_confidence : Minimum confidence threshold
        """
        self.use_model = use_model and MODEL_MANAGER_AVAILABLE
        self.min_confidence = min_confidence
        self.model_manager = None
        self.classifier = None
        self.is_initialized = False

        logger.info(f"LanguageDetector initialized (use_model={self.use_model})")

    def initialize(self) -> bool:
        """
        Initialize the ML model (if using)

        Returns:
        --------
        { bool } : True if successful, False otherwise
        """
        if not self.use_model:
            self.is_initialized = True
            return True

        try:
            logger.info("Initializing language detection model...")

            self.model_manager = get_model_manager()
            self.classifier = self.model_manager.load_pipeline(
                model_name="language_detector",
                task="text-classification",
            )

            self.is_initialized = True
            logger.success("Language detector initialized successfully")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize language detector: {repr(e)}")
            logger.warning("Falling back to langdetect library")
            self.use_model = False
            self.is_initialized = True
            return False

    def detect(self, text: str, **kwargs) -> LanguageDetectionResult:
        """
        Detect language of input text

        Arguments:
        ----------
        text { str } : Input text to analyze

        **kwargs : Additional options

        Returns:
        --------
        LanguageDetectionResult object
        """
        warnings = []

        if not text or not isinstance(text, str):
            return self._create_unknown_result(text="", warnings=["Empty or invalid text"])

        cleaned_text = self._clean_text(text)
        char_count = len(cleaned_text)
        word_count = len(cleaned_text.split())

        if char_count < self.MIN_TEXT_LENGTH:
            warnings.append(f"Text too short ({char_count} chars, minimum {self.MIN_TEXT_LENGTH}). Detection may be unreliable.")

        script = self._detect_script(cleaned_text)

        result = None

        # Strategy 1: ML model (most accurate)
        if self.use_model and self.is_initialized:
            try:
                result = self._detect_with_model(text=cleaned_text)
                result.detection_method = "xlm-roberta-model"

            except Exception as e:
                logger.warning(f"Model detection failed: {repr(e)}, trying fallback")
                warnings.append("Model detection failed, using fallback")

        # Strategy 2: langdetect library
        if result is None and LANGDETECT_AVAILABLE:
            try:
                result = self._detect_with_langdetect(text=cleaned_text)
                result.detection_method = "langdetect-library"

            except Exception as e:
                logger.warning(f"langdetect failed: {repr(e)}, trying heuristics")
                warnings.append("langdetect failed, using heuristics")

        # Strategy 3: character-based heuristics (always produces a result)
        if result is None:
            result = self._detect_with_heuristics(cleaned_text, script)
            result.detection_method = "character-heuristics"

        result.script = script
        result.char_count = char_count
        result.word_count = word_count
        result.warnings.extend(warnings)

        if len([v for v in result.all_languages.values() if v > 0.2]) > 1:
            result.is_multilingual = True
            # Append directly to the result so the warning is not lost
            result.warnings.append("Text appears to contain multiple languages")

        logger.info(f"Detected language: {result.primary_language.value} (confidence: {result.confidence:.2f}, method: {result.detection_method})")

        return result

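    # Worked example of the multilingual check in detect(): with
    # all_languages == {"en": 0.55, "es": 0.35, "fr": 0.10}, two entries exceed
    # the 0.2 threshold, so is_multilingual is set to True and a warning is
    # appended to the result (illustrative numbers).
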
    def _detect_with_model(self, text: str) -> LanguageDetectionResult:
        """
        Detect language using the XLM-RoBERTa model, with sentence-based chunking
        for more accurate detection on long texts
        """
        if not self.is_initialized:
            if not self.initialize():
                raise RuntimeError("Model not initialized")

        try:
            chunks = self._split_text_into_chunks(text=text)
            logger.info(f"Split text into {len(chunks)} chunks for language detection")

            all_chunk_results = []

            for i, chunk in enumerate(chunks):
                try:
                    chunk_result = self._process_single_chunk(chunk=chunk)
                    all_chunk_results.append(chunk_result)

                except Exception as e:
                    logger.warning(f"Chunk {i + 1} processing failed: {repr(e)}")
                    continue

            if not all_chunk_results:
                raise RuntimeError("All chunks failed processing")

            return self._aggregate_chunk_results(chunk_results=all_chunk_results)

        except Exception as e:
            logger.error(f"Chunk-based model detection failed: {repr(e)}")
            raise

    def _split_text_into_chunks(self, text: str, max_chunk_length: int = 500, min_chunk_length: int = 50) -> List[str]:
        """
        Split text into meaningful chunks for language detection

        Arguments:
        ----------
        text { str } : Input text

        max_chunk_length { int } : Maximum characters per chunk

        min_chunk_length { int } : Minimum characters per chunk

        Returns:
        --------
        List of text chunks
        """
        if len(text) <= max_chunk_length:
            return [text]

        # Split on sentence-ending punctuation
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) + 1 <= max_chunk_length:
                if current_chunk:
                    current_chunk += " " + sentence
                else:
                    current_chunk = sentence
            else:
                # Flush the current chunk; chunks below the minimum length are
                # discarded as too unreliable for detection
                if current_chunk and len(current_chunk) >= min_chunk_length:
                    chunks.append(current_chunk)

                current_chunk = sentence

        if current_chunk and len(current_chunk) >= min_chunk_length:
            chunks.append(current_chunk)

        # Fall back to fixed-length splitting if sentence splitting produced
        # nothing usable (e.g. text without sentence punctuation)
        if len(chunks) == 0 or (len(chunks) == 1 and len(chunks[0]) > max_chunk_length):
            chunks = self._split_fixed_length(text, max_chunk_length)

        logger.debug(f"Split {len(text)} chars into {len(chunks)} chunks: {[len(c) for c in chunks]}")
        return chunks

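    # Example of the chunking behaviour above: with max_chunk_length=500 and
    # min_chunk_length=50, a 1,200-character text of ordinary sentences yields
    # roughly three chunks, each ending on a sentence boundary; text containing
    # no '.', '!' or '?' at all falls through to _split_fixed_length below.
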
    def _split_fixed_length(self, text: str, chunk_size: int = 1000) -> List[str]:
        """
        Fallback: Split text into fixed-length chunks
        """
        chunks = []
        start = 0

        while start < len(text):
            end = min(start + chunk_size, len(text))

            # Prefer to break on a space so words are not cut in half; the next
            # chunk resumes exactly where this one ends, so no text is dropped
            if end < len(text):
                last_space = text.rfind(' ', start, end)

                if last_space > start + int(chunk_size * 0.7):
                    end = last_space

            chunks.append(text[start:end].strip())
            start = end

        return chunks

    def _process_single_chunk(self, chunk: str) -> Dict:
        """
        Process a single chunk through the language detection model
        """
        tokenizer = self.classifier.tokenizer

        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True,
            add_special_tokens=True,
        )

        # Run inference on whichever device the model lives on
        model = self.classifier.model
        device = next(model.parameters()).device

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Keep the three most probable languages for this chunk
        top_predictions = torch.topk(predictions[0], k=3)

        chunk_results = {}

        for i in range(len(top_predictions.indices)):
            lang_idx = top_predictions.indices[i].item()
            score = top_predictions.values[i].item()

            lang_code = model.config.id2label[lang_idx]

            # Normalize composite labels (e.g. "en_Latn") to a bare ISO code
            if '_' in lang_code:
                lang_code = lang_code.split('_')[0]

            chunk_results[lang_code] = score

        return chunk_results

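    # The softmax above turns logits into per-label probabilities and topk keeps
    # the three best, so a confidently English chunk might come back as
    # {"en": 0.97, "nl": 0.02, "de": 0.01} - the shape _aggregate_chunk_results
    # expects (illustrative numbers).
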
    def _aggregate_chunk_results(self, chunk_results: List[Dict]) -> LanguageDetectionResult:
        """
        Aggregate results from multiple chunks using weighted averaging
        """
        # Collect (score, weight) pairs per language; a chunk's weight is its
        # top score, so confident chunks count more. Pairing each score with
        # the weight of the chunk it came from keeps the two aligned even when
        # a language only appears in some chunks.
        all_scores = {}
        total_weight = 0.0

        for chunk_result in chunk_results:
            chunk_weight = max(chunk_result.values()) if chunk_result else 0.0
            total_weight += chunk_weight

            for lang_code, score in chunk_result.items():
                all_scores.setdefault(lang_code, []).append((score, chunk_weight))

        # Weighted average over all chunks; chunks where a language did not
        # appear contribute zero to its weighted sum
        weighted_scores = {}

        for lang_code, pairs in all_scores.items():
            weighted_sum = sum(score * weight for score, weight in pairs)

            if total_weight > 0:
                weighted_scores[lang_code] = weighted_sum / total_weight
            else:
                weighted_scores[lang_code] = sum(score for score, _ in pairs) / len(pairs)

        # Pick the language with the highest aggregated score
        primary_lang = None
        primary_conf = 0.0

        for lang_code, score in weighted_scores.items():
            if score > primary_conf:
                primary_conf = score
                primary_lang = lang_code

        try:
            primary_language = Language(primary_lang)

        except ValueError:
            primary_language = Language.UNKNOWN

        detection_quality = self._assess_detection_quality(chunk_results, weighted_scores)

        warnings = []

        if detection_quality.get('low_confidence', False):
            warnings.append("Low confidence across multiple chunks")

        if detection_quality.get('inconsistent', False):
            warnings.append("Inconsistent language detection across chunks")

        return LanguageDetectionResult(
            primary_language=primary_language,
            confidence=primary_conf,
            all_languages=weighted_scores,
            script=Script.UNKNOWN,  # filled in by detect()
            is_multilingual=detection_quality.get('multilingual', False),
            detection_method="model-chunked",
            char_count=0,  # filled in by detect()
            word_count=0,
            warnings=warnings,
        )

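    # Worked example of the aggregation above: chunks scoring
    # {"en": 0.9, "fr": 0.05} and {"en": 0.6, "es": 0.3} carry weights 0.9 and
    # 0.6 (their top scores, total 1.5). English aggregates to
    # (0.9*0.9 + 0.6*0.6) / 1.5 = 0.78; Spanish, absent from the first chunk,
    # gets (0.3*0.6) / 1.5 = 0.12, so English wins by a wide margin.
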
    def _assess_detection_quality(self, chunk_results: List[Dict], final_scores: Dict[str, float]) -> Dict[str, bool]:
        """
        Assess the quality and consistency of language detection across chunks
        """
        quality_metrics = {
            'low_confidence': False,
            'inconsistent': False,
            'multilingual': False,
        }

        if not chunk_results:
            return quality_metrics

        # Low confidence: the average top score across chunks is weak
        avg_top_confidence = sum(max(chunk.values()) for chunk in chunk_results if chunk) / len(chunk_results)

        if avg_top_confidence < 0.6:
            quality_metrics['low_confidence'] = True

        # Inconsistent: chunks disagree on the most likely language
        chunk_primaries = []

        for chunk in chunk_results:
            if chunk:
                primary = max(chunk.items(), key=lambda x: x[1])[0]
                chunk_primaries.append(primary)

        if len(set(chunk_primaries)) > 1:
            quality_metrics['inconsistent'] = True

        # Multilingual: more than one language has a substantial final score
        strong_languages = [lang for lang, score in final_scores.items() if score > 0.2]

        if len(strong_languages) > 1:
            quality_metrics['multilingual'] = True

        return quality_metrics

    def _detect_with_langdetect(self, text: str) -> LanguageDetectionResult:
        """
        Detect language using the langdetect library
        """
        lang_probs = detect_langs(text)

        all_languages = {prob.lang: prob.prob for prob in lang_probs}

        # detect_langs returns candidates sorted by probability
        primary = lang_probs[0]

        try:
            primary_language = Language(primary.lang)

        except ValueError:
            primary_language = Language.UNKNOWN

        return LanguageDetectionResult(
            primary_language=primary_language,
            confidence=primary.prob,
            all_languages=all_languages,
            script=Script.UNKNOWN,
            is_multilingual=False,
            detection_method="langdetect",
            char_count=0,
            word_count=0,
            warnings=[],
        )

    def _detect_with_heuristics(self, text: str, script: Script) -> LanguageDetectionResult:
        """
        Detect language using character-based heuristics
        """
        # Scripts that map (almost) one-to-one onto a single supported language
        script_to_language = {
            Script.CHINESE: Language.CHINESE,
            Script.JAPANESE: Language.JAPANESE,
            Script.KOREAN: Language.KOREAN,
            Script.ARABIC: Language.ARABIC,
            Script.CYRILLIC: Language.RUSSIAN,
            Script.DEVANAGARI: Language.HINDI,
            Script.GREEK: Language.GREEK,
            Script.HEBREW: Language.HEBREW,
            Script.THAI: Language.THAI,
        }

        if script in script_to_language:
            primary_language = script_to_language[script]
            confidence = 0.7
        else:
            # Latin-script text needs word-level analysis
            primary_language = self._detect_latin_language(text)
            confidence = 0.5

        return LanguageDetectionResult(
            primary_language=primary_language,
            confidence=confidence,
            all_languages={primary_language.value: confidence},
            script=script,
            is_multilingual=False,
            detection_method="heuristics",
            char_count=0,
            word_count=0,
            warnings=["Detection using heuristics, accuracy may be limited"],
        )

    def _detect_latin_language(self, text: str) -> Language:
        """
        Detect Latin-script language using common word patterns
        """
        text_lower = text.lower()

        # Common function words per language
        patterns = {
            Language.ENGLISH: ['the', 'and', 'is', 'in', 'to', 'of', 'a', 'that', 'it', 'with', 'for', 'on', 'this', 'are', 'was', 'be', 'have', 'from', 'or', 'by'],
            Language.SPANISH: ['el', 'la', 'de', 'que', 'y', 'en', 'un', 'por', 'con', 'no', 'una', 'para', 'es', 'al', 'como', 'del', 'los', 'se', 'las', 'su'],
            Language.FRENCH: ['le', 'de', 'un', 'être', 'et', 'à', 'il', 'avoir', 'ne', 'je', 'son', 'que', 'ce', 'du', 'quel', 'elle', 'dans', 'pour', 'au', 'avec'],
            Language.GERMAN: ['der', 'die', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit', 'sich', 'des', 'auf', 'für', 'ist', 'im', 'dem', 'nicht', 'ein', 'eine', 'als'],
            Language.ITALIAN: ['di', 'e', 'il', 'la', 'che', 'per', 'un', 'in', 'è', 'a', 'non', 'una', 'da', 'sono', 'come', 'del', 'ma', 'si', 'nel', 'anche'],
            Language.PORTUGUESE: ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'é', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais'],
        }

        scores = {}
        words = set(text_lower.split())

        for lang, common_words in patterns.items():
            scores[lang] = sum(1 for word in common_words if word in words)

        if scores:
            best_lang = max(scores.items(), key=lambda x: x[1])

            # Require more than two matches before trusting the result
            if best_lang[1] > 2:
                return best_lang[0]

        # Default to English when no pattern matches convincingly
        return Language.ENGLISH

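    # Worked example of the stopword scoring above: for
    # "el gato duerme en la casa", the Spanish list matches 'el', 'en' and 'la'
    # (score 3 > 2) while the English list matches nothing, so SPANISH is
    # returned; with two or fewer matches the method defaults to ENGLISH.
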
    def _detect_script(self, text: str) -> Script:
        """
        Detect the writing script used in text
        """
        script_counts = {script: 0 for script in Script if script not in [Script.MIXED, Script.UNKNOWN]}

        for char in text:
            if char in string.whitespace or char in string.punctuation:
                continue

            code_point = ord(char)

            for script, ranges in self.SCRIPT_RANGES.items():
                for start, end in ranges:
                    if start <= code_point <= end:
                        script_counts[script] += 1
                        break

        total_chars = sum(script_counts.values())

        if total_chars == 0:
            return Script.UNKNOWN

        script_percentages = {script: count / total_chars for script, count in script_counts.items() if count > 0}

        # If no single script clearly dominates, report mixed content
        if len(script_percentages) > 1:
            max_percentage = max(script_percentages.values())

            if max_percentage < 0.7:
                return Script.MIXED

        if script_percentages:
            return max(script_percentages.items(), key=lambda x: x[1])[0]

        return Script.UNKNOWN

    def _clean_text(self, text: str) -> str:
        """
        Clean text for language detection
        """
        # Remove URLs
        text = re.sub(r'https?://\S+', '', text)
        text = re.sub(r'www\.\S+', '', text)

        # Remove e-mail addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Collapse whitespace
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

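    # Example: "Contact me at foo@bar.com or https://example.com  today" cleans
    # to "Contact me at or today": the URL and the e-mail-like token are dropped
    # and runs of whitespace collapse to single spaces.
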
    def _create_unknown_result(self, text: str, warnings: List[str]) -> LanguageDetectionResult:
        """
        Create result for unknown language
        """
        return LanguageDetectionResult(
            primary_language=Language.UNKNOWN,
            confidence=0.0,
            all_languages={},
            script=Script.UNKNOWN,
            is_multilingual=False,
            detection_method="none",
            char_count=len(text),
            word_count=len(text.split()),
            warnings=warnings,
        )

    def is_language(self, text: str, target_language: Language, threshold: float = 0.7) -> bool:
        """
        Check if text is in a specific language

        Arguments:
        ----------
        text : Input text

        target_language : Language to check for

        threshold : Minimum confidence threshold

        Returns:
        --------
        { bool } : True if text is in target language with sufficient confidence
        """
        result = self.detect(text)
        return (result.primary_language == target_language) and (result.confidence >= threshold)

    def get_supported_languages(self) -> List[str]:
        """
        Get list of supported language codes
        """
        return [lang.value for lang in Language if lang != Language.UNKNOWN]

    def cleanup(self):
        """
        Clean up resources
        """
        self.classifier = None
        self.is_initialized = False


def quick_detect(text: str, **kwargs) -> LanguageDetectionResult:
    """
    Quick language detection with default settings

    Arguments:
    ----------
    text : Input text

    **kwargs : Override settings

    Returns:
    --------
    LanguageDetectionResult object
    """
    detector = LanguageDetector(**kwargs)

    if detector.use_model:
        detector.initialize()

    return detector.detect(text)

def is_english(text: str, threshold: float = 0.7) -> bool:
    """
    Quick check if text is English
    """
    detector = LanguageDetector(use_model=True)
    return detector.is_language(text, Language.ENGLISH, threshold)


__all__ = [
    'Script',
    'Language',
    'is_english',
    'quick_detect',
    'LanguageDetector',
    'LanguageDetectionResult',
]
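

# Minimal usage sketch; assumes the optional langdetect / model_manager
# dependencies resolve as they do in this project's environment
if __name__ == "__main__":
    sample = "The quick brown fox jumps over the lazy dog. " * 3
    result = quick_detect(sample)
    print(result.to_dict())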