davidtran999 committed on
Commit
519b145
·
1 Parent(s): a5fd3d2

Push full code from hue-portal-backend folder

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. Dockerfile +63 -1
  2. backend/.DS_Store +0 -0
  3. backend/API_MODE_FIX.md +82 -0
  4. backend/API_MODE_READY.md +108 -0
  5. backend/CHECK_API_MODE.md +47 -0
  6. backend/DUAL_PATH_RAG_README.md +319 -0
  7. backend/Dockerfile +24 -0
  8. backend/FIX_LOCAL_LLM_ISSUE.md +91 -0
  9. backend/GENERAL_CONVERSATION_FIX.md +130 -0
  10. backend/HF_SPACES_NOT_RECEIVING.md +97 -0
  11. backend/LLM_SWITCH_GUIDE.md +211 -0
  12. backend/OPTIMIZE_CHATBOT_PERFORMANCE.md +642 -0
  13. backend/TEST_API_MODE.md +83 -0
  14. backend/WHY_LLM_NOT_CALLED.md +76 -0
  15. backend/chuyenapichatbot.py +0 -0
  16. backend/docs/API_ENDPOINTS.md +152 -0
  17. backend/docs/INTENT_CLASSIFICATION_IMPROVEMENTS.md +87 -0
  18. backend/docs/LEGAL_REFRESH.md +55 -0
  19. backend/docs/OCR_SETUP.md +56 -0
  20. backend/golden_queries_example.json +68 -0
  21. backend/hue_portal/Procfile +0 -0
  22. backend/hue_portal/chatbot/__init__.py +4 -0
  23. backend/hue_portal/chatbot/advanced_features.py +185 -0
  24. backend/hue_portal/chatbot/analytics.py +194 -0
  25. backend/hue_portal/chatbot/apps.py +7 -0
  26. backend/hue_portal/chatbot/cache_monitor.py +195 -0
  27. backend/hue_portal/chatbot/chatbot.py +1092 -0
  28. backend/hue_portal/chatbot/context_manager.py +220 -0
  29. backend/hue_portal/chatbot/dialogue_manager.py +173 -0
  30. backend/hue_portal/chatbot/document_topics.py +74 -0
  31. backend/hue_portal/chatbot/download_progress.py +294 -0
  32. backend/hue_portal/chatbot/dual_path_router.py +274 -0
  33. backend/hue_portal/chatbot/entity_extraction.py +395 -0
  34. backend/hue_portal/chatbot/exact_match_cache.py +61 -0
  35. backend/hue_portal/chatbot/fast_path_handler.py +59 -0
  36. backend/hue_portal/chatbot/legal_guardrails.py +35 -0
  37. backend/hue_portal/chatbot/llm_integration.py +1746 -0
  38. backend/hue_portal/chatbot/llm_integration.py.backup +372 -0
  39. backend/hue_portal/chatbot/llm_integration.py.bak +877 -0
  40. backend/hue_portal/chatbot/query_expansion.py +228 -0
  41. backend/hue_portal/chatbot/router.py +165 -0
  42. backend/hue_portal/chatbot/schemas/legal_answer.rail +63 -0
  43. backend/hue_portal/chatbot/slow_path_handler.py +1392 -0
  44. backend/hue_portal/chatbot/structured_legal.py +276 -0
  45. backend/hue_portal/chatbot/tests/__init__.py +1 -0
  46. backend/hue_portal/chatbot/tests/__pycache__/test_smoke.cpython-310.pyc +0 -0
  47. backend/hue_portal/chatbot/tests/test_intent_keywords.py +29 -0
  48. backend/hue_portal/chatbot/tests/test_intent_training.py +22 -0
  49. backend/hue_portal/chatbot/tests/test_router.py +41 -0
  50. backend/hue_portal/chatbot/tests/test_smoke.py +29 -0
Dockerfile CHANGED
@@ -54,11 +54,73 @@ fi
  echo "[Docker] Collecting static files..."
  python /app/hue_portal/manage.py collectstatic --noinput || echo "[Docker] Collectstatic failed, continuing..."

+ echo "[Docker] Preloading all models to avoid first-request timeout..."
+ python -c "
+ import os
+ import sys
+ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'hue_portal.hue_portal.settings')
+ import django
+ django.setup()
+
+ print('[Docker] 🔄 Starting model preload...', flush=True)
+
+ # 1. Preload Embedding Model (BGE-M3)
+ try:
+     print('[Docker] 📦 Preloading embedding model (BGE-M3)...', flush=True)
+     from hue_portal.core.embeddings import get_embedding_model
+     embedding_model = get_embedding_model()
+     if embedding_model:
+         print('[Docker] ✅ Embedding model preloaded successfully', flush=True)
+     else:
+         print('[Docker] ⚠️ Embedding model not loaded', flush=True)
+ except Exception as e:
+     print(f'[Docker] ⚠️ Embedding model preload failed: {e}', flush=True)
+
+ # 2. Preload LLM Model (llama.cpp)
+ llm_provider = os.environ.get('DEFAULT_LLM_PROVIDER') or os.environ.get('LLM_PROVIDER', '')
+ if llm_provider.lower() == 'llama_cpp':
+     try:
+         print('[Docker] 📦 Preloading LLM model (llama.cpp)...', flush=True)
+         from hue_portal.chatbot.llm_integration import get_llm_generator
+         llm_gen = get_llm_generator()
+         if llm_gen and hasattr(llm_gen, 'llama_cpp') and llm_gen.llama_cpp:
+             print('[Docker] ✅ LLM model preloaded successfully', flush=True)
+         else:
+             print('[Docker] ⚠️ LLM model not loaded (may load on first request)', flush=True)
+     except Exception as e:
+         print(f'[Docker] ⚠️ LLM model preload failed: {e} (will load on first request)', flush=True)
+ else:
+     print(f'[Docker] ⏭️ Skipping LLM preload (provider is {llm_provider or \"not set\"}, not llama_cpp)', flush=True)
+
+ # 3. Preload Reranker Model
+ try:
+     print('[Docker] 📦 Preloading reranker model...', flush=True)
+     from hue_portal.core.reranker import get_reranker
+     reranker = get_reranker()
+     if reranker:
+         print('[Docker] ✅ Reranker model preloaded successfully', flush=True)
+     else:
+         print('[Docker] ⚠️ Reranker model not loaded (may load on first request)', flush=True)
+ except Exception as e:
+     print(f'[Docker] ⚠️ Reranker preload failed: {e} (will load on first request)', flush=True)
+
+ print('[Docker] ✅ Model preload completed', flush=True)
+ " || echo "[Docker] ⚠️ Model preload had errors (models will load on first request)"
+
  echo "[Docker] Starting gunicorn..."
- exec gunicorn -b 0.0.0.0:7860 --timeout 1800 --graceful-timeout 1800 --worker-class sync hue_portal.hue_portal.wsgi:application
+ # Reduce tokenizers parallelism warnings and risk of fork deadlocks
+ export TOKENIZERS_PARALLELISM=false
+ # Shorter timeouts to avoid long hangs; adjust if needed
+ cd /app/backend && export PYTHONPATH="/app/backend:${PYTHONPATH}" && exec gunicorn -b 0.0.0.0:7860 --timeout 600 --graceful-timeout 600 --worker-class sync --config python:hue_portal.hue_portal.gunicorn_app hue_portal.hue_portal.gunicorn_app:application
  EOF

  RUN chmod +x /entrypoint.sh

  EXPOSE 7860
  CMD ["/entrypoint.sh"]
+
+ EXPOSE 7860
+ CMD ["/entrypoint.sh"]
+
+ EXPOSE 7860
+ CMD ["/entrypoint.sh"]
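The new gunicorn command loads both its configuration and the WSGI callable from `hue_portal.hue_portal.gunicorn_app` via `--config python:...`. That module is not part of this view; a minimal sketch of what such a combined config/entry module can look like is shown below (file contents, worker count, and the hook body are assumptions, not the repository's actual code):

```python
# hue_portal/hue_portal/gunicorn_app.py -- illustrative sketch only
import os

# Expose the Django WSGI app so "gunicorn_app:application" resolves.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hue_portal.hue_portal.settings")
from hue_portal.hue_portal.wsgi import application  # noqa: E402,F401

# Standard gunicorn settings, picked up because of "--config python:...".
bind = "0.0.0.0:7860"
workers = 1              # assumption: one worker so large models load only once
timeout = 600
graceful_timeout = 600

def post_fork(server, worker):
    # Gunicorn hook: runs in each worker right after fork.
    server.log.info("Worker %s forked; models were preloaded in the entrypoint.", worker.pid)
```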
backend/.DS_Store ADDED
Binary file (6.15 kB).
 
backend/API_MODE_FIX.md ADDED
@@ -0,0 +1,82 @@
+ # API Mode Fix - HF Spaces was not receiving the documents
+
+ ## Problem
+ When the backend called the HF Spaces API, it only sent the plain `query`, not the `prompt` built from the documents. As a result, HF Spaces never received the information from the retrieved documents.
+
+ ## What was fixed
+
+ ### 1. `llm_integration.py` - Line 309
+ **Before:**
+ ```python
+ elif self.provider == LLM_PROVIDER_API:
+     result = self._generate_api(query, context)
+ ```
+
+ **After:**
+ ```python
+ elif self.provider == LLM_PROVIDER_API:
+     # For API mode, send the full prompt (with documents) as the message
+     # This ensures HF Spaces receives all context from retrieved documents
+     result = self._generate_api(prompt, context)
+ ```
+
+ ### 2. `llm_integration.py` - `_generate_api()` method
+ **Before:**
+ ```python
+ def _generate_api(self, query: str, context: Optional[List[Dict[str, Any]]] = None) -> Optional[str]:
+     payload = {
+         "message": query,  # only the plain query is sent
+         "reset_session": False
+     }
+ ```
+
+ **After:**
+ ```python
+ def _generate_api(self, prompt: str, context: Optional[List[Dict[str, Any]]] = None) -> Optional[str]:
+     # Send the full prompt (with documents) as the message to HF Spaces
+     payload = {
+         "message": prompt,  # send the full prompt including documents
+         "reset_session": False
+     }
+ ```
+
+ ### 3. Added detailed logging
+ - Log when the API is called: `[LLM] 🔗 Calling API: ...`
+ - Log the payload: `[LLM] 📤 Payload: ...`
+ - Log the response: `[LLM] 📥 Response status: ...`
+ - Detailed error logging
+
+ ## How to test
+
+ 1. **Restart the backend server:**
+ ```bash
+ pkill -f "manage.py runserver"
+ cd backend && source venv/bin/activate && cd hue_portal
+ python3 manage.py runserver 0.0.0.0:8000
+ ```
+
+ 2. **Test in the UI:**
+    - Open http://localhost:3000
+    - Send the question: "Mức phạt vượt đèn đỏ là bao nhiêu?"
+    - The server logs should show:
+      - `[RAG] Using LLM provider: api`
+      - `[LLM] 🔗 Calling API: ...`
+      - `[LLM] 📥 Response status: 200`
+      - `[LLM] ✅ Got message from API`
+
+ 3. **Check the response:**
+    - The response must come from the LLM (natural text, not a template)
+    - The response must contain information from the retrieved documents
+
+ ## Notes
+
+ - The prompt can be long (it contains documents), but the HF Spaces API supports long prompts
+ - On timeouts, the timeout in `_generate_api()` can be increased (currently 60s)
+ - If it still does not work, check:
+   - whether HF Spaces is running
+   - the internet connection
+   - the server logs for the specific error
+
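The notes above only show `_generate_api()` in fragments. A minimal, self-contained sketch of the call it performs is given below as a standalone function (the endpoint path, the `message` payload/response key, and the 60s timeout come from the notes; everything else, including the function name, is an assumption):

```python
# Illustrative sketch of the API-mode call described above, not the committed code.
import os
from typing import Any, Dict, List, Optional

import requests


def generate_via_hf_spaces(prompt: str, context: Optional[List[Dict[str, Any]]] = None) -> Optional[str]:
    """POST the full RAG prompt (documents included) to the HF Spaces chat endpoint."""
    base_url = os.environ.get("HF_API_BASE_URL", "https://davidtran999-hue-portal-backend.hf.space/api")
    payload = {"message": prompt, "reset_session": False}
    try:
        print(f"[LLM] 🔗 Calling API: {base_url}/chatbot/chat/", flush=True)
        resp = requests.post(f"{base_url}/chatbot/chat/", json=payload, timeout=60)
        print(f"[LLM] 📥 Response status: {resp.status_code}", flush=True)
        if resp.status_code != 200:
            return None
        # The Space is assumed to answer with {"message": "..."} as in the docs above.
        return resp.json().get("message")
    except requests.RequestException as exc:
        print(f"[LLM] ⚠️ API call failed: {exc}", flush=True)
        return None
```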
backend/API_MODE_READY.md ADDED
@@ -0,0 +1,108 @@
+ # API Mode - Readiness status
+
+ ## ✅ The project is ready to test with API mode!
+
+ ### Completed:
+
+ 1. **Code Integration** ✅
+    - `llm_integration.py` already has the `_generate_api()` method
+    - API mode is fully supported
+    - Error handling and timeouts are covered
+
+ 2. **Configuration** ✅
+    - The `.env` file has been created with `LLM_PROVIDER=api`
+    - The API URL is set: `https://davidtran999-hue-portal-backend.hf.space/api`
+
+ 3. **Scripts** ✅
+    - `switch_llm_provider.py` - switch between providers
+    - `test_api_mode.py` - test the API connection
+
+ ### Usage:
+
+ #### 1. Check the current configuration:
+ ```bash
+ python3 switch_llm_provider.py show
+ ```
+
+ #### 2. Make sure API mode is in use:
+ ```bash
+ python3 switch_llm_provider.py api
+ ```
+
+ #### 3. Test the API connection:
+ ```bash
+ python3 test_api_mode.py
+ ```
+
+ #### 4. Restart the Django server:
+ ```bash
+ # If using manage.py
+ python manage.py runserver
+
+ # If using gunicorn
+ systemctl restart gunicorn
+ # or
+ pkill -f gunicorn && gunicorn your_app.wsgi:application
+ ```
+
+ ### Notes:
+
+ 1. **The API endpoint must be running**
+    - The Hugging Face Space must be deployed and running
+    - URL: `https://davidtran999-hue-portal-backend.hf.space/api`
+    - Endpoint: `/api/chatbot/chat/`
+
+ 2. **Model loading time**
+    - The first API call can take a while (the model is loading)
+    - You may get a 503 (Service Unavailable) - this is normal
+    - Wait a few minutes and retry
+
+ 3. **Request format**
+    - The API expects: `{"message": "text", "reset_session": false}`
+    - No `session_id` is needed (it is generated automatically)
+
+ ### Troubleshooting:
+
+ #### API timeout:
+ - Check the internet connection
+ - Check whether the Hugging Face Space is running
+ - Check that the URL is correct
+
+ #### API returns 503:
+ - The model is loading; wait a few minutes and retry
+ - This is normal for the first request
+
+ #### API returns 400:
+ - Check the request format
+ - Make sure the `message` field has a value
+
+ ### Manual test:
+
+ ```python
+ import requests
+
+ url = "https://davidtran999-hue-portal-backend.hf.space/api/chatbot/chat/"
+ payload = {
+     "message": "Xin chào",
+     "reset_session": False
+ }
+
+ response = requests.post(url, json=payload, timeout=60)
+ print(f"Status: {response.status_code}")
+ print(f"Response: {response.json()}")
+ ```
+
+ ### Conclusion:
+
+ **The project is code-ready!**
+
+ You only need to:
+ 1. Make sure the Hugging Face Space is running
+ 2. Restart the Django server
+ 3. Test with a simple question
+
+ The code will automatically:
+ - Call the correct API endpoint
+ - Handle errors
+ - Return the response message
+
backend/CHECK_API_MODE.md ADDED
@@ -0,0 +1,47 @@
+ # Checking API Mode
+
+ ## Problem
+ The current response is template-based, not from the LLM in API mode.
+
+ ## Done so far
+ 1. ✅ Configuration is correct: `LLM_PROVIDER=api`
+ 2. ✅ A direct test of API mode works
+ 3. ✅ Logging added to the RAG pipeline for debugging
+
+ ## How to check
+
+ ### 1. Check the server logs
+ When sending a request, the logs should contain:
+ - `[RAG] Using LLM provider: api`
+ - `[LLM] Generating answer with provider: api`
+ - `[LLM] ✅ Answer generated successfully` or an error
+
+ ### 2. Direct test
+ ```bash
+ curl -X POST http://localhost:8000/api/chatbot/chat/ \
+   -H "Content-Type: application/json" \
+   -d '{"message": "Mức phạt vượt đèn đỏ là bao nhiêu?", "reset_session": false}'
+ ```
+
+ ### 3. Check in the code
+ - The RAG pipeline calls `llm.generate_answer()` with `use_llm=True`
+ - The LLM generator has `provider == "api"`
+ - `_generate_api()` is called with the query
+
+ ## Possible causes
+
+ 1. **API timeout**: the HF Spaces API may time out
+ 2. **API returns None**: the API may return None, falling back to the template
+ 3. **LLM not available**: `get_llm_generator()` may return None
+
+ ## Solution
+
+ If API mode is not working:
+ 1. Check whether the Hugging Face Space is running
+ 2. Check the internet connection
+ 3. Check that the API URL is correct
+ 4. Check the server logs for the specific error
+
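A quick way to confirm which provider the running process actually resolves is from `python manage.py shell`; the snippet below is a sketch based on the attributes referenced above (`get_llm_generator()` and the `provider` field), with the import path assumed from the repository layout:

```python
# Run inside `python manage.py shell` on the backend host.
from hue_portal.chatbot.llm_integration import get_llm_generator

llm = get_llm_generator()
if llm is None:
    print("LLM generator not available -> responses fall back to templates")
else:
    print("Active provider:", llm.provider)  # expected: "api"
```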
backend/DUAL_PATH_RAG_README.md ADDED
@@ -0,0 +1,319 @@
+ # Dual-Path RAG Architecture
+
+ ## Overview
+
+ Dual-Path RAG is an optimized architecture for the legal chatbot that separates processing into two paths:
+
+ - **Fast Path**: Golden dataset (200 common questions) → <200ms, 100% accuracy
+ - **Slow Path**: Full RAG pipeline → 4-8s, 99.99% accuracy
+
+ ## Architecture
+
+ ```
+ User Query
+
+ Intent Classification
+
+ Dual-Path Router
+ ├─ Keyword Router (exact/fuzzy match)
+ ├─ Semantic Similarity Search (threshold 0.85)
+ └─ LLM Router (optional, for edge cases)
+
+ ┌─────────────────┬──────────────────┐
+ │   Fast Path     │   Slow Path      │
+ │   (<200ms)      │   (4-8s)         │
+ │                 │                  │
+ │ Golden Dataset  │ Full RAG:        │
+ │ - Exact match   │ - Hybrid Search  │
+ │ - Fuzzy match   │ - Top 20 docs    │
+ │ - Similarity    │ - LLM Generation │
+ │                 │ - Guardrails     │
+ │ 100% accuracy   │ 99.99% accuracy  │
+ └─────────────────┴──────────────────┘
+
+ Response + Routing Log
+ ```
+
+ ## Components
+
+ ### 1. Database Models
+
+ **GoldenQuery**: Stores verified queries and responses
+ - `query`, `query_normalized`, `query_embedding`
+ - `intent`, `response_message`, `response_data`
+ - `verified_by`, `usage_count`, `accuracy_score`
+
+ **QueryRoutingLog**: Logs routing decisions for monitoring
+ - `route` (fast_path/slow_path)
+ - `router_method` (keyword/similarity/llm/default)
+ - `response_time_ms`, `similarity_score`
+
+ ### 2. Router Components
+
+ **KeywordRouter**: Fast keyword-based matching
+ - Exact match (normalized query)
+ - Fuzzy match (70% word overlap)
+ - ~1-5ms latency
+
+ **DualPathRouter**: Main router with hybrid logic
+ - Step 1: Keyword routing (fastest)
+ - Step 2: Semantic similarity (threshold 0.85)
+ - Step 3: LLM router fallback (optional)
+ - Default: Slow Path
+
+ ### 3. Path Handlers
+
+ **FastPathHandler**: Returns cached responses from golden dataset
+ - Increments usage count
+ - Returns verified response instantly
+
+ **SlowPathHandler**: Full RAG pipeline
+ - Hybrid search (BM25 + vector)
+ - Top 20 documents
+ - LLM generation with structured output
+ - Auto-save high-quality responses to golden dataset
+
+ ## Setup
+
+ ### 1. Run Migration
+
+ ```bash
+ cd backend/hue_portal
+ python manage.py migrate core
+ ```
+
+ ### 2. Import Initial Golden Dataset
+
+ ```bash
+ # Import from JSON file
+ python manage.py manage_golden_dataset import --file golden_queries.json --format json
+
+ # Or import from CSV
+ python manage.py manage_golden_dataset import --file golden_queries.csv --format csv
+ ```
+
+ ### 3. Generate Embeddings (for semantic search)
+
+ ```bash
+ # Generate embeddings for all queries
+ python manage.py manage_golden_dataset update_embeddings
+
+ # Or for specific query
+ python manage.py manage_golden_dataset update_embeddings --query-id 123
+ ```
+
+ ## Management Commands
+
+ ### Import Queries
+
+ ```bash
+ python manage.py manage_golden_dataset import \
+     --file golden_queries.json \
+     --format json \
+     --verify-by legal_expert \
+     --skip-embeddings  # Skip if embeddings will be generated later
+ ```
+
+ ### Verify Query
+
+ ```bash
+ python manage.py manage_golden_dataset verify \
+     --query-id 123 \
+     --verify-by gpt4 \
+     --accuracy 1.0
+ ```
+
+ ### Update Embeddings
+
+ ```bash
+ python manage.py manage_golden_dataset update_embeddings \
+     --batch-size 10
+ ```
+
+ ### View Statistics
+
+ ```bash
+ python manage.py manage_golden_dataset stats
+ ```
+
+ ### Export Dataset
+
+ ```bash
+ python manage.py manage_golden_dataset export \
+     --file exported_queries.json \
+     --active-only
+ ```
+
+ ### Delete Query
+
+ ```bash
+ # Soft delete (deactivate)
+ python manage.py manage_golden_dataset delete --query-id 123 --soft
+
+ # Hard delete
+ python manage.py manage_golden_dataset delete --query-id 123
+ ```
+
+ ## API Endpoints
+
+ ### Chat Endpoint (unchanged)
+
+ ```
+ POST /api/chatbot/chat/
+ {
+   "message": "Mức phạt vượt đèn đỏ là bao nhiêu?",
+   "session_id": "optional-uuid",
+   "reset_session": false
+ }
+ ```
+
+ Response includes routing metadata:
+ ```json
+ {
+   "message": "...",
+   "intent": "search_fine",
+   "results": [...],
+   "_source": "fast_path",  // or "slow_path"
+   "_routing": {
+     "path": "fast_path",
+     "method": "keyword",
+     "confidence": 1.0
+   },
+   "_golden_query_id": 123  // if fast_path
+ }
+ ```
+
+ ### Analytics Endpoint
+
+ ```
+ GET /api/chatbot/analytics/?days=7&type=all
+ ```
+
+ Returns:
+ - `routing`: Fast/Slow path statistics
+ - `golden_dataset`: Golden dataset stats
+ - `performance`: P50/P95/P99 response times
+
+ ## Golden Dataset Format
+
+ ### JSON Format
+
+ ```json
+ [
+   {
+     "query": "Mức phạt vượt đèn đỏ là bao nhiêu?",
+     "intent": "search_fine",
+     "response_message": "Mức phạt vượt đèn đỏ là từ 200.000 - 400.000 VNĐ...",
+     "response_data": {
+       "message": "...",
+       "intent": "search_fine",
+       "results": [...],
+       "count": 1
+     },
+     "verified_by": "legal_expert",
+     "accuracy_score": 1.0
+   }
+ ]
+ ```
+
+ ### CSV Format
+
+ ```csv
+ query,intent,response_message,response_data
+ "Mức phạt vượt đèn đỏ là bao nhiêu?","search_fine","Mức phạt...","{\"message\":\"...\",\"results\":[...]}"
+ ```
+
+ ## Monitoring
+
+ ### Routing Statistics
+
+ ```python
+ from hue_portal.chatbot.analytics import get_routing_stats
+
+ stats = get_routing_stats(days=7)
+ print(f"Fast Path: {stats['fast_path_percentage']:.1f}%")
+ print(f"Slow Path: {stats['slow_path_percentage']:.1f}%")
+ print(f"Fast Path Avg Time: {stats['fast_path_avg_time_ms']:.1f}ms")
+ print(f"Slow Path Avg Time: {stats['slow_path_avg_time_ms']:.1f}ms")
+ ```
+
+ ### Golden Dataset Stats
+
+ ```python
+ from hue_portal.chatbot.analytics import get_golden_dataset_stats
+
+ stats = get_golden_dataset_stats()
+ print(f"Active queries: {stats['active_queries']}")
+ print(f"Embedding coverage: {stats['embedding_coverage']:.1f}%")
+ ```
+
+ ## Best Practices
+
+ ### 1. Building Golden Dataset
+
+ - Start with 50-100 most common queries from logs
+ - Verify each response manually or with strong LLM (GPT-4/Claude)
+ - Add queries gradually based on usage patterns
+ - Target: 200 queries covering 80% of traffic
+
+ ### 2. Verification Process
+
+ - **Weekly review**: Check top 20 most-used queries
+ - **Monthly audit**: Review all queries for accuracy
+ - **Update embeddings**: When adding new queries
+ - **Version control**: Track changes with `version` field
+
+ ### 3. Tuning Similarity Threshold
+
+ - Default: 0.85 (conservative, high precision)
+ - Lower (0.75): More queries go to Fast Path, but risk false matches
+ - Higher (0.90): Fewer false matches, but more queries go to Slow Path
+
+ ### 4. Auto-Save from Slow Path
+
+ Slow Path automatically saves high-quality responses:
+ - Confidence >= 0.95
+ - Has results
+ - Message length > 50 chars
+ - Not already in golden dataset
+
+ Review auto-saved queries weekly and verify before activating.
+
+ ## Troubleshooting
+
+ ### Fast Path not matching
+
+ 1. Check if query is normalized correctly
+ 2. Verify golden query exists: `GoldenQuery.objects.filter(query_normalized=...)`
+ 3. Check similarity threshold (may be too high)
+ 4. Ensure embeddings are generated: `update_embeddings`
+
+ ### Slow performance
+
+ 1. Check routing logs: `QueryRoutingLog.objects.filter(route='fast_path')`
+ 2. Verify Fast Path percentage (should be ~80%)
+ 3. Check embedding model loading time
+ 4. Monitor database query performance
+
+ ### Low accuracy
+
+ 1. Review golden dataset verification
+ 2. Check `accuracy_score` of golden queries
+ 3. Monitor Slow Path responses for quality
+ 4. Update golden queries with better responses
+
+ ## Expected Performance
+
+ - **Fast Path**: <200ms (target: <100ms)
+ - **Slow Path**: 4-8s (full RAG pipeline)
+ - **Overall**: 80% queries <200ms, 20% queries 4-8s
+ - **Cache Hit Rate**: 75-85% (Fast Path usage)
+
+ ## Next Steps
+
+ 1. Import initial 200 common queries
+ 2. Generate embeddings for all queries
+ 3. Monitor routing statistics for 1 week
+ 4. Tune similarity threshold based on metrics
+ 5. Expand golden dataset based on usage patterns
+
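The routing order the README describes (exact keyword match, then fuzzy 70% word overlap, then embedding similarity at 0.85, then Slow Path) can be summarized in a short sketch; the helper names and the `golden` mapping below are illustrative, not the actual `KeywordRouter`/`DualPathRouter` classes:

```python
# Minimal sketch of the dual-path routing decision described above.
from typing import Callable, Dict, Optional, Tuple

SIMILARITY_THRESHOLD = 0.85
FUZZY_OVERLAP = 0.70


def route_query(query: str,
                golden: Dict[str, list],              # normalized golden query -> embedding
                embed_fn: Callable[[str], list],
                similarity_fn: Callable[[list, list], float]) -> Tuple[str, str, Optional[str]]:
    """Return (path, router_method, matched_golden_query)."""
    normalized = " ".join(query.lower().split())

    # 1. Exact match on the normalized query
    if normalized in golden:
        return "fast_path", "keyword", normalized

    # 2. Fuzzy match: 70% word overlap with a golden query
    words = set(normalized.split())
    for candidate in golden:
        cand_words = set(candidate.split())
        if cand_words and len(words & cand_words) / len(cand_words) >= FUZZY_OVERLAP:
            return "fast_path", "keyword", candidate

    # 3. Semantic similarity against golden-query embeddings
    query_vec = embed_fn(normalized)
    best = max(golden, key=lambda c: similarity_fn(query_vec, golden[c]), default=None)
    if best is not None and similarity_fn(query_vec, golden[best]) >= SIMILARITY_THRESHOLD:
        return "fast_path", "similarity", best

    # 4. Default: full RAG pipeline
    return "slow_path", "default", None
```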
backend/Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.11-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1
+ WORKDIR /app
+
+ # System dependencies (OCR + build essentials)
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+     build-essential \
+     tesseract-ocr \
+     tesseract-ocr-eng \
+     tesseract-ocr-vie \
+     libpoppler-cpp-dev \
+     pkg-config \
+     libgl1 && \
+     rm -rf /var/lib/apt/lists/*
+
+ COPY backend/requirements.txt /app/requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY backend /app
+
+ CMD ["gunicorn", "-b", "0.0.0.0:8000", "hue_portal.hue_portal.wsgi:application"]
+
backend/FIX_LOCAL_LLM_ISSUE.md ADDED
@@ -0,0 +1,91 @@
+ # Fix: Server is using the local LLM instead of API mode
+
+ ## Problem
+ When testing chat on the web, the server runs a local LLM on the machine instead of calling the HF Spaces API.
+
+ ## Root cause
+ 1. **Global instance cache:** `get_llm_generator()` uses a global `_llm_generator` instance that is created only once
+ 2. **Server started with an old env:** if the server started with `LLM_PROVIDER=local`, the instance keeps provider=local
+ 3. **No reload when the env changes:** when `.env` is updated, the server does not recreate the instance automatically
+
+ ## Fixed
+
+ ### File: `backend/hue_portal/chatbot/llm_integration.py`
+
+ **Before:**
+ ```python
+ _llm_generator: Optional[LLMGenerator] = None
+
+ def get_llm_generator() -> Optional[LLMGenerator]:
+     global _llm_generator
+     if _llm_generator is None:
+         _llm_generator = LLMGenerator()
+     return _llm_generator if _llm_generator.is_available() else None
+ ```
+
+ **After:**
+ ```python
+ _llm_generator: Optional[LLMGenerator] = None
+ _last_provider: Optional[str] = None
+
+ def get_llm_generator() -> Optional[LLMGenerator]:
+     """Get or create LLM generator instance.
+
+     Recreates instance if provider changed (e.g., from local to api).
+     """
+     global _llm_generator, _last_provider
+
+     # Get current provider from env
+     current_provider = os.environ.get("LLM_PROVIDER", LLM_PROVIDER_NONE).lower()
+
+     # Recreate if provider changed or instance doesn't exist
+     if _llm_generator is None or _last_provider != current_provider:
+         _llm_generator = LLMGenerator()
+         _last_provider = current_provider
+         print(f"[LLM] 🔄 Recreated LLM generator with provider: {current_provider}", flush=True)
+
+     return _llm_generator if _llm_generator.is_available() else None
+ ```
+
+ ## How to test
+
+ 1. **Make sure `.env` has the right config:**
+ ```bash
+ cd backend
+ cat .env | grep LLM
+ # Should show:
+ # LLM_PROVIDER=api
+ # HF_API_BASE_URL=https://davidtran999-hue-portal-backend.hf.space/api
+ ```
+
+ 2. **Restart the server:**
+ ```bash
+ pkill -f "manage.py runserver"
+ cd backend && source venv/bin/activate && cd hue_portal
+ python3 manage.py runserver 0.0.0.0:8000
+ ```
+
+ 3. **Test in the web UI:**
+    - Open http://localhost:3000/chat
+    - Send the question: "Mức phạt vượt đèn đỏ là bao nhiêu?"
+    - The server logs should show:
+      - `[LLM] 🔄 Recreated LLM generator with provider: api`
+      - `[RAG] Using LLM provider: api`
+      - `[LLM] 🔗 Calling API: https://davidtran999-hue-portal-backend.hf.space/api/chatbot/chat/`
+
+ 4. **Check the response:**
+    - The response must come from the HF Spaces API (natural text, not a template)
+    - There should be NO logs about local model loading
+
+ ## Notes
+
+ - The server recreates the LLM instance automatically when the provider changes
+ - Restarting the server after editing `.env` is not strictly required (but recommended)
+ - If the local LLM is still used, check:
+   - whether `.env` has `LLM_PROVIDER=api`
+   - whether the server loaded the right `.env`
+   - the server logs to see which provider is in use
+
backend/GENERAL_CONVERSATION_FIX.md ADDED
@@ -0,0 +1,130 @@
+ # Fixing the Chatbot to Support General Conversation
+
+ ## Problem
+
+ The chatbot could not answer like a regular AI chatbot because:
+ 1. **The LLM was only called when documents were found** → general queries could not be answered
+ 2. **An error message was returned as soon as no documents were found** → the LLM never got a chance to answer
+
+ ## Applied solution
+
+ ### 1. `rag.py` - Let the LLM answer even when there are no documents
+
+ **File:** `backend/hue_portal/core/rag.py`
+
+ **Change:**
+ - Before: returned an error message as soon as there were no documents
+ - After: the LLM is called even when there are no documents (general conversation mode)
+
+ ```python
+ # Before:
+ if not documents:
+     return error_message  # ← LLM never called
+
+ # After:
+ # Call the LLM first (even without documents)
+ if use_llm:
+     llm_answer = llm.generate_answer(query, context=context, documents=documents if documents else [])
+     if llm_answer:
+         return llm_answer
+
+ # Only return the error when there is no LLM and no documents
+ if not documents:
+     return error_message
+ ```
+
+ ### 2. `llm_integration.py` - Prompt for general conversation
+
+ **File:** `backend/hue_portal/chatbot/llm_integration.py`
+
+ **Change:**
+ - With documents → answer based on the documents (strict mode)
+ - Without documents → allow general conversation (friendly mode)
+
+ ```python
+ if documents:
+     # Strict mode: answer only from the documents
+     prompt_parts.extend([...])
+ else:
+     # General conversation mode
+     prompt_parts.extend([
+         "- Trả lời câu hỏi một cách tự nhiên và hữu ích như một chatbot AI thông thường",
+         "- Nếu câu hỏi liên quan đến pháp luật nhưng không có thông tin, hãy nói rõ",
+         ...
+     ])
+ ```
+
+ ### 3. `rag_pipeline` - Always call generate_answer_template
+
+ **File:** `backend/hue_portal/core/rag.py`
+
+ **Change:**
+ - Before: returned an error as soon as there were no documents
+ - After: always call `generate_answer_template` so the LLM gets a chance to answer
+
+ ```python
+ # Before:
+ if not documents:
+     return {'answer': error_message, ...}  # ← LLM never called
+
+ # After:
+ # Always call generate_answer_template (which calls the LLM if available)
+ answer = generate_answer_template(query, documents, content_type, context=context, use_llm=use_llm)
+ ```
+
+ ### 4. `chatbot.py` - Use the LLM answer even when count=0
+
+ **File:** `backend/hue_portal/chatbot/chatbot.py`
+
+ **Change:**
+ - Before: the RAG result was only used if `count > 0`
+ - After: the LLM answer is used even when `count = 0`
+
+ ```python
+ # Before:
+ if rag_result["count"] > 0 and rag_result["confidence"] >= confidence:
+     # use the answer
+
+ # After:
+ if rag_result.get("answer") and (rag_result["count"] > 0 or rag_result.get("answer", "").strip()):
+     # use the answer (even when count=0)
+ ```
+
+ ## Result
+
+ ✅ **The LLM is called even when there are no documents**
+ - Logs show: `[RAG] Using LLM provider: api`
+ - Logs show: `[LLM] 🔗 Calling API: ...`
+
+ ⚠️ **The API returns a 500 error**
+ - The HF Spaces API may be failing
+ - Or the prompt is too long
+ - Check the HF Spaces logs
+
+ ## How to test
+
+ 1. **Test with a general query:**
+ ```bash
+ curl -X POST http://localhost:8000/api/chatbot/chat/ \
+   -H "Content-Type: application/json" \
+   -d '{"message":"mấy giờ rồi","reset_session":false}'
+ ```
+
+ 2. **Watch the logs:**
+ ```bash
+ tail -f /tmp/django_general_conv.log | grep -E "\[RAG\]|\[LLM\]"
+ ```
+
+ 3. **Check that the LLM is called:**
+    - Look for: `[RAG] Using LLM provider: api`
+    - Look for: `[LLM] 🔗 Calling API: ...`
+
+ ## Notes
+
+ - **API mode needs HF Spaces to be up** → if the API returns 500, check HF Spaces
+ - **Local mode** works better if a GPU is available
+ - **General conversation** only works when an LLM is available
+
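Taken together, the four changes above produce a simple fallback order: LLM first (with or without documents), then the document-based template, then the generic error message. The sketch below shows that ordering only; the function name, the `template_fn` helper, and the fallback wording are illustrative, not the actual code in `rag.py`:

```python
# Sketch of the answer-fallback order described above (illustrative names).
from typing import Any, Callable, Dict, List, Optional


def answer_with_fallback(query: str,
                         documents: List[Any],
                         llm,                      # object with .generate_answer(...) or None
                         template_fn: Callable[[str, List[Any]], str],
                         context: Optional[List[Dict]] = None,
                         use_llm: bool = True) -> str:
    # 1. Give the LLM the first chance, even with an empty document list (general conversation).
    if use_llm and llm is not None:
        llm_answer = llm.generate_answer(query, context=context, documents=documents or [])
        if llm_answer:
            return llm_answer

    # 2. No LLM answer: fall back to the document-based template when documents exist.
    if documents:
        return template_fn(query, documents)

    # 3. Nothing to work with: generic "no information" reply (placeholder wording).
    return "No matching information was found for this question."
```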
backend/HF_SPACES_NOT_RECEIVING.md ADDED
@@ -0,0 +1,97 @@
+ # Issue: HF Spaces is not receiving requests from the local project
+
+ ## Analysis
+
+ From the HF Spaces logs:
+ - HF Spaces is loading a **local model** (Qwen/Qwen2.5-7B-Instruct)
+ - HF Spaces is **NOT** receiving requests from the local project
+ - When the local project calls the API, the response is still **template-based**
+
+ ## Possible causes
+
+ 1. **The LLM is not called when documents are found:**
+    - The RAG pipeline has `use_llm=True` but LLM generation may fail
+    - It falls back to the template when the LLM fails
+
+ 2. **LLM generation fails:**
+    - API timeout
+    - API returns None
+    - Error during generation
+
+ 3. **The local server did not load the right env:**
+    - The server started before `.env` was updated
+    - The server needs a restart
+
+ ## Solution
+
+ ### 1. Make sure the server loads the right env
+ ```bash
+ # Stop the server
+ pkill -f "manage.py runserver"
+
+ # Start again with the new env
+ cd backend && source venv/bin/activate && cd hue_portal
+ python3 manage.py runserver 0.0.0.0:8000
+ ```
+
+ ### 2. Check the logs while testing
+ When sending a request that has documents, the logs should show:
+ - `[RAG] Using LLM provider: api`
+ - `[LLM] 🔗 Calling API: ...`
+ - `[LLM] 📥 Response status: 200`
+
+ If these logs are missing, it means:
+ - the LLM is not being called
+ - or LLM generation fails before the API call
+
+ ### 3. Test API mode directly
+ ```bash
+ cd backend && source venv/bin/activate
+ python3 -c "
+ import os
+ os.environ['LLM_PROVIDER'] = 'api'
+ os.environ['HF_API_BASE_URL'] = 'https://davidtran999-hue-portal-backend.hf.space/api'
+ import sys
+ sys.path.insert(0, 'hue_portal')
+ from chatbot.llm_integration import LLMGenerator, LLM_PROVIDER_API
+ llm = LLMGenerator(provider=LLM_PROVIDER_API)
+ result = llm._generate_api('Test prompt with documents')
+ print(f'Result: {result}')
+ "
+ ```
+
+ ## Debug steps
+
+ 1. **Check the env variables:**
+ ```bash
+ cd backend && cat .env | grep LLM
+ ```
+
+ 2. **Restart the server:**
+ ```bash
+ pkill -f "manage.py runserver"
+ cd backend && source venv/bin/activate && cd hue_portal
+ python3 manage.py runserver 0.0.0.0:8000
+ ```
+
+ 3. **Test with a question that has documents:**
+ ```bash
+ curl -X POST http://localhost:8000/api/chatbot/chat/ \
+   -H "Content-Type: application/json" \
+   -d '{"message": "Mức phạt vượt đèn đỏ là bao nhiêu?", "reset_session": false}'
+ ```
+
+ 4. **Read the server logs:**
+    - Look for `[RAG]` logs
+    - Look for `[LLM]` logs
+    - Look for error messages
+
+ ## Notes
+
+ - The HF Spaces logs show it is using a **local model**, not API mode
+ - This means HF Spaces runs standalone and does not receive requests from the local project
+ - The local project must call the HF Spaces API to get responses from the model running on HF Spaces
+
backend/LLM_SWITCH_GUIDE.md ADDED
@@ -0,0 +1,211 @@
+ # LLM Provider Switching Guide
+
+ > **As of this update, the chatbot defaults to the local model Qwen/Qwen2.5-7B-Instruct (8-bit) if you do not configure `LLM_PROVIDER`.**
+ > You can use the scripts below to switch to API/OpenAI/... at any time.
+
+ Scripts for flexibly switching the LLM provider between the local model, API mode, and other providers.
+
+ ## Usage
+
+ ### Method 1: Python Script (detailed)
+
+ ```bash
+ # Show the current configuration
+ python3 switch_llm_provider.py show
+
+ # Switch to the local model
+ python3 switch_llm_provider.py local
+
+ # Switch to local with a custom model
+ python3 switch_llm_provider.py local --model Qwen/Qwen2.5-14B-Instruct --device cuda --8bit
+
+ # Switch to API mode
+ python3 switch_llm_provider.py api
+
+ # Switch to API with a custom URL
+ python3 switch_llm_provider.py api --url https://custom-api.hf.space/api
+
+ # Switch to OpenAI
+ python3 switch_llm_provider.py openai
+
+ # Switch to Anthropic
+ python3 switch_llm_provider.py anthropic
+
+ # Switch to Ollama
+ python3 switch_llm_provider.py ollama
+
+ # Disable the LLM (template only)
+ python3 switch_llm_provider.py none
+ ```
+
+ ### Method 2: Shell Script (quick)
+
+ ```bash
+ # Show the current configuration
+ ./llm_switch.sh
+
+ # Switch to local
+ ./llm_switch.sh local
+
+ # Switch to API
+ ./llm_switch.sh api
+
+ # Switch to OpenAI
+ ./llm_switch.sh openai
+
+ # Disable the LLM
+ ./llm_switch.sh none
+ ```
+
+ ## Supported providers
+
+ ### 1. Local Model (`local`)
+ Uses a local Hugging Face model on your machine.
+
+ **Configuration:**
+ ```bash
+ LLM_PROVIDER=local
+ LOCAL_MODEL_PATH=Qwen/Qwen2.5-7B-Instruct
+ LOCAL_MODEL_DEVICE=cuda  # or cpu, auto
+ LOCAL_MODEL_8BIT=true    # or false
+ LOCAL_MODEL_4BIT=false   # or true
+ ```
+
+ **Examples:**
+ ```bash
+ # 7B model with 8-bit quantization
+ python3 switch_llm_provider.py local --model Qwen/Qwen2.5-7B-Instruct --device cuda --8bit
+
+ # 14B model with 4-bit quantization
+ python3 switch_llm_provider.py local --model Qwen/Qwen2.5-14B-Instruct --device cuda --4bit
+ ```
+
+ ### 2. API Mode (`api`)
+ Calls the Hugging Face Spaces API.
+
+ **Configuration:**
+ ```bash
+ LLM_PROVIDER=api
+ HF_API_BASE_URL=https://davidtran999-hue-portal-backend.hf.space/api
+ ```
+
+ **Examples:**
+ ```bash
+ # Use the default API URL
+ python3 switch_llm_provider.py api
+
+ # Use a custom API URL
+ python3 switch_llm_provider.py api --url https://your-custom-api.hf.space/api
+ ```
+
+ ### 3. OpenAI (`openai`)
+ Uses the OpenAI API.
+
+ **Configuration:**
+ ```bash
+ LLM_PROVIDER=openai
+ OPENAI_API_KEY=your-api-key-here
+ ```
+
+ **Example:**
+ ```bash
+ python3 switch_llm_provider.py openai
+ ```
+
+ ### 4. Anthropic (`anthropic`)
+ Uses the Anthropic Claude API.
+
+ **Configuration:**
+ ```bash
+ LLM_PROVIDER=anthropic
+ ANTHROPIC_API_KEY=your-api-key-here
+ ```
+
+ **Example:**
+ ```bash
+ python3 switch_llm_provider.py anthropic
+ ```
+
+ ### 5. Ollama (`ollama`)
+ Uses a local Ollama server.
+
+ **Configuration:**
+ ```bash
+ LLM_PROVIDER=ollama
+ OLLAMA_BASE_URL=http://localhost:11434
+ OLLAMA_MODEL=qwen2.5:7b
+ ```
+
+ **Example:**
+ ```bash
+ python3 switch_llm_provider.py ollama
+ ```
+
+ ### 6. None (`none`)
+ Disables the LLM; only template-based generation is used.
+
+ **Example:**
+ ```bash
+ python3 switch_llm_provider.py none
+ ```
+
+ ## Important notes
+
+ 1. **Restart the server**: after changing the provider, restart the Django server to apply it:
+ ```bash
+ # If using manage.py
+ python manage.py runserver
+
+ # If using gunicorn
+ systemctl restart gunicorn
+ # or
+ pkill -f gunicorn && gunicorn ...
+ ```
+
+ 2. **Local model requirements**:
+    - A GPU with enough VRAM (7B 8-bit: ~7GB, 14B 4-bit: ~8GB)
+    - Required packages: `transformers`, `accelerate`, `bitsandbytes`
+    - The model is downloaded automatically on first use
+
+ 3. **API mode**:
+    - Needs an internet connection
+    - The API endpoint must be running
+    - May be rate limited
+
+ 4. **Environment variables**:
+    - The script creates/updates the `.env` file in the `backend/` directory
+    - If there is no `.env` file, the script creates one
+
+ ## Troubleshooting
+
+ ### Local model does not load
+ - Check that the GPU has enough VRAM
+ - Try a smaller model: `Qwen/Qwen2.5-1.5B-Instruct`
+ - Try CPU: `--device cpu` (slower)
+
+ ### API mode does not work
+ - Check the internet connection
+ - Check that the API URL is correct
+ - Check that the API endpoint is running
+
+ ### Script cannot find the .env file
+ - The script creates a new `.env` file automatically
+ - Or create it manually: `touch backend/.env`
+
+ ## Examples
+
+ ### Development: use API mode (fast, no GPU needed)
+ ```bash
+ ./llm_switch.sh api
+ ```
+
+ ### Production: use the local model (best quality, no API cost)
+ ```bash
+ ./llm_switch.sh local --model Qwen/Qwen2.5-7B-Instruct --device cuda --8bit
+ ```
+
+ ### Testing: disable the LLM (template only)
+ ```bash
+ ./llm_switch.sh none
+ ```
+
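The switch scripts themselves are not included in this view. As a rough idea of the mechanism the guide describes, updating provider keys in `backend/.env` can be done with a small helper like the one below (the function, file handling, and main block are assumptions about how such a script might work, not the contents of `switch_llm_provider.py`):

```python
# Illustrative sketch of rewriting backend/.env when switching providers.
from pathlib import Path


def set_env_var(env_path: Path, key: str, value: str) -> None:
    """Create or update KEY=value in a .env file, preserving the other lines."""
    lines = env_path.read_text().splitlines() if env_path.exists() else []
    updated, found = [], False
    for line in lines:
        if line.startswith(f"{key}="):
            updated.append(f"{key}={value}")
            found = True
        else:
            updated.append(line)
    if not found:
        updated.append(f"{key}={value}")
    env_path.write_text("\n".join(updated) + "\n")


if __name__ == "__main__":
    env_file = Path("backend/.env")
    set_env_var(env_file, "LLM_PROVIDER", "api")
    set_env_var(env_file, "HF_API_BASE_URL", "https://davidtran999-hue-portal-backend.hf.space/api")
    print(f"Updated {env_file}; restart the Django server to apply the change.")
```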
backend/OPTIMIZE_CHATBOT_PERFORMANCE.md ADDED
@@ -0,0 +1,642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tối ưu Tốc độ và Độ chính xác Chatbot
2
+
3
+ Ngày tạo: 2025-01-27
4
+
5
+ ## 1. Phân tích Bottlenecks hiện tại
6
+
7
+ ### 1.1 Intent Classification
8
+ **Vấn đề:**
9
+ - Loop qua nhiều keywords mỗi lần (fine_keywords: 9 items, fine_single_words: 7 items)
10
+ - Tính `_remove_accents()` nhiều lần cho cùng keyword
11
+ - Không có compiled regex patterns
12
+
13
+ **Impact:** ~5-10ms mỗi query
14
+
15
+ ### 1.2 Search Pipeline
16
+ **Vấn đề:**
17
+ - `list(queryset)` - Load TẤT CẢ objects vào memory trước khi search
18
+ - TF-IDF vectorization cho toàn bộ dataset mỗi lần
19
+ - Không có early exit khi tìm thấy kết quả tốt
20
+ - Query expansion query database mỗi lần
21
+
22
+ **Impact:** ~100-500ms cho dataset lớn
23
+
24
+ ### 1.3 LLM Generation
25
+ **Vấn đề:**
26
+ - Prompt được build lại mỗi lần (không cache)
27
+ - Không có streaming response
28
+ - max_new_tokens=150 (OK) nhưng có thể tối ưu thêm
29
+ - Không cache generated responses
30
+
31
+ **Impact:** ~1-5s cho local model, ~2-10s cho API
32
+
33
+ ### 1.4 Không có Response Caching
34
+ **Vấn đề:**
35
+ - Cùng query được xử lý lại từ đầu
36
+ - Search results không được cache
37
+ - Intent classification không được cache
38
+
39
+ **Impact:** ~100-500ms cho duplicate queries
40
+
41
+ ## 2. Tối ưu Intent Classification
42
+
43
+ ### 2.1 Pre-compile Keyword Patterns
44
+
45
+ ```python
46
+ # backend/hue_portal/core/chatbot.py
47
+
48
+ import re
49
+ from functools import lru_cache
50
+
51
+ class Chatbot:
52
+ def __init__(self):
53
+ self.intent_classifier = None
54
+ self.vectorizer = None
55
+ # Pre-compile keyword patterns
56
+ self._compile_keyword_patterns()
57
+ self._train_classifier()
58
+
59
+ def _compile_keyword_patterns(self):
60
+ """Pre-compile regex patterns for faster matching."""
61
+ # Fine keywords (multi-word first, then single)
62
+ self.fine_patterns_multi = [
63
+ re.compile(r'\b' + re.escape(kw) + r'\b', re.IGNORECASE)
64
+ for kw in ["mức phạt", "vi phạm", "đèn đỏ", "nồng độ cồn",
65
+ "mũ bảo hiểm", "tốc độ", "bằng lái", "vượt đèn"]
66
+ ]
67
+ self.fine_patterns_single = [
68
+ re.compile(r'\b' + re.escape(kw) + r'\b', re.IGNORECASE)
69
+ for kw in ["phạt", "vượt", "đèn", "mức"]
70
+ ]
71
+
72
+ # Pre-compute accent-free versions
73
+ self.fine_keywords_ascii = [self._remove_accents(kw) for kw in
74
+ ["mức phạt", "vi phạm", "đèn đỏ", ...]]
75
+
76
+ # Procedure, Office, Advisory patterns...
77
+ # Similar pattern compilation
78
+
79
+ @lru_cache(maxsize=1000)
80
+ def classify_intent(self, query: str) -> Tuple[str, float]:
81
+ """Cached intent classification."""
82
+ query_lower = query.lower().strip()
83
+
84
+ # Fast path: Check compiled patterns
85
+ for pattern in self.fine_patterns_multi:
86
+ if pattern.search(query_lower):
87
+ return ("search_fine", 0.95)
88
+
89
+ # ... rest of logic
90
+ ```
91
+
92
+ **Lợi ích:**
93
+ - Giảm ~50% thời gian intent classification
94
+ - Cache kết quả cho duplicate queries
95
+
96
+ ### 2.2 Early Exit Strategy
97
+
98
+ ```python
99
+ def _keyword_based_intent(self, query: str) -> Tuple[str, float]:
100
+ query_lower = query.lower().strip()
101
+
102
+ # Fast path: Check most common intents first
103
+ # Fine queries are most common → check first
104
+ if any(pattern.search(query_lower) for pattern in self.fine_patterns_multi):
105
+ return ("search_fine", 0.95)
106
+
107
+ # Early exit for very short queries (likely greeting)
108
+ if len(query.split()) <= 2:
109
+ if any(greeting in query_lower for greeting in ["xin chào", "chào", "hello"]):
110
+ return ("greeting", 0.9)
111
+
112
+ # ... rest
113
+ ```
114
+
115
+ ## 3. Tối ưu Search Pipeline
116
+
117
+ ### 3.1 Limit QuerySet trước khi Load
118
+
119
+ ```python
120
+ # backend/hue_portal/core/search_ml.py
121
+
122
+ def search_with_ml(queryset, query, text_fields, top_k=20, min_score=0.1, use_hybrid=True):
123
+ if not query:
124
+ return queryset[:top_k]
125
+
126
+ # OPTIMIZATION: Limit queryset early for large datasets
127
+ # Only search in first N records if dataset is huge
128
+ MAX_SEARCH_CANDIDATES = 1000
129
+ total_count = queryset.count()
130
+
131
+ if total_count > MAX_SEARCH_CANDIDATES:
132
+ # Use database-level filtering first
133
+ # Try exact match on primary field first
134
+ primary_field = text_fields[0] if text_fields else None
135
+ if primary_field:
136
+ exact_matches = queryset.filter(
137
+ **{f"{primary_field}__icontains": query}
138
+ )[:top_k * 2]
139
+
140
+ if exact_matches.count() >= top_k:
141
+ # We have enough exact matches, return them
142
+ return exact_matches[:top_k]
143
+
144
+ # Limit candidates for ML search
145
+ queryset = queryset[:MAX_SEARCH_CANDIDATES]
146
+
147
+ # Continue with existing search logic...
148
+ ```
149
+
150
+ ### 3.2 Cache Search Results
151
+
152
+ ```python
153
+ # backend/hue_portal/core/search_ml.py
154
+
155
+ from functools import lru_cache
156
+ import hashlib
157
+ import json
158
+
159
+ def _get_query_hash(query: str, model_name: str, text_fields: tuple) -> str:
160
+ """Generate hash for query caching."""
161
+ key = f"{query}|{model_name}|{':'.join(text_fields)}"
162
+ return hashlib.md5(key.encode()).hexdigest()
163
+
164
+ # Cache search results for 1 hour
165
+ @lru_cache(maxsize=500)
166
+ def _cached_search(query_hash: str, queryset_ids: tuple, top_k: int):
167
+ """Cached search results."""
168
+ # This will be called with actual queryset in wrapper
169
+ pass
170
+
171
+ def search_with_ml(queryset, query, text_fields, top_k=20, min_score=0.1, use_hybrid=True):
172
+ # Check cache first
173
+ query_hash = _get_query_hash(query, queryset.model.__name__, tuple(text_fields))
174
+
175
+ # Try to get from cache (if queryset hasn't changed)
176
+ # Note: Full caching requires tracking queryset state
177
+
178
+ # ... existing search logic
179
+ ```
180
+
181
+ ### 3.3 Optimize TF-IDF Calculation
182
+
183
+ ```python
184
+ # Pre-compute TF-IDF vectors for common queries
185
+ # Use incremental TF-IDF instead of recalculating
186
+
187
+ from sklearn.feature_extraction.text import TfidfVectorizer
188
+ import numpy as np
189
+
190
+ class CachedTfidfVectorizer:
191
+ """TF-IDF vectorizer with caching."""
192
+
193
+ def __init__(self):
194
+ self.vectorizer = None
195
+ self.doc_vectors = None
196
+ self.doc_ids = None
197
+
198
+ def fit_transform_cached(self, documents: List[str], doc_ids: List[int]):
199
+ """Fit and cache document vectors."""
200
+ if self.doc_ids == tuple(doc_ids):
201
+ # Same documents, reuse vectors
202
+ return self.doc_vectors
203
+
204
+ # New documents, recompute
205
+ self.vectorizer = TfidfVectorizer(
206
+ analyzer='word',
207
+ ngram_range=(1, 2),
208
+ min_df=1,
209
+ max_df=0.95,
210
+ lowercase=True
211
+ )
212
+ self.doc_vectors = self.vectorizer.fit_transform(documents)
213
+ self.doc_ids = tuple(doc_ids)
214
+ return self.doc_vectors
215
+ ```
216
+
217
+ ### 3.4 Early Exit khi có Exact Match
218
+
219
+ ```python
220
+ def search_with_ml(queryset, query, text_fields, top_k=20, min_score=0.1, use_hybrid=True):
221
+ # OPTIMIZATION: Check exact matches first (fastest)
222
+ query_normalized = normalize_text(query)
223
+
224
+ # Try exact match on primary field
225
+ primary_field = text_fields[0] if text_fields else None
226
+ if primary_field:
227
+ exact_qs = queryset.filter(**{f"{primary_field}__iexact": query})
228
+ if exact_qs.exists():
229
+ # Found exact match, return immediately
230
+ return exact_qs[:top_k]
231
+
232
+ # Try case-insensitive contains (faster than ML)
233
+ contains_qs = queryset.filter(**{f"{primary_field}__icontains": query})
234
+ if contains_qs.count() <= top_k * 2:
235
+ # Small result set, return directly
236
+ return contains_qs[:top_k]
237
+
238
+ # Only use ML search if no good exact matches
239
+ # ... existing ML search logic
240
+ ```
241
+
242
+ ## 4. Tối ưu LLM Generation
243
+
244
+ ### 4.1 Prompt Caching
245
+
246
+ ```python
247
+ # backend/hue_portal/chatbot/llm_integration.py
248
+
249
+ from functools import lru_cache
250
+ import hashlib
251
+
252
+ class LLMGenerator:
253
+ def __init__(self, provider: Optional[str] = None):
254
+ self.provider = provider or LLM_PROVIDER
255
+ self.prompt_cache = {} # Cache prompts by hash
256
+ self.response_cache = {} # Cache responses
257
+
258
+ def _get_prompt_hash(self, query: str, documents: List[Any]) -> str:
259
+ """Generate hash for prompt caching."""
260
+ doc_ids = [getattr(doc, 'id', None) for doc in documents[:5]]
261
+ key = f"{query}|{doc_ids}"
262
+ return hashlib.md5(key.encode()).hexdigest()
263
+
264
+ def generate_answer(self, query: str, context: Optional[List[Dict]], documents: Optional[List[Any]]):
265
+ if not self.is_available():
266
+ return None
267
+
268
+ # Check cache first
269
+ prompt_hash = self._get_prompt_hash(query, documents or [])
270
+ if prompt_hash in self.response_cache:
271
+ cached_response = self.response_cache[prompt_hash]
272
+ # Check if cache is still valid (e.g., < 1 hour old)
273
+ if cached_response.get('timestamp', 0) > time.time() - 3600:
274
+ return cached_response['response']
275
+
276
+ # Build prompt (may be cached)
277
+ prompt = self._build_prompt(query, context, documents)
278
+ response = self._generate_from_prompt(prompt, context=context)
279
+
280
+ # Cache response
281
+ if response:
282
+ self.response_cache[prompt_hash] = {
283
+ 'response': response,
284
+ 'timestamp': time.time()
285
+ }
286
+
287
+ return response
288
+ ```
289
+
290
+ ### 4.2 Optimize Local Model Generation
291
+
292
+ ```python
293
+ def _generate_local(self, prompt: str) -> Optional[str]:
294
+ # OPTIMIZATION: Use faster generation parameters
295
+ with torch.no_grad():
296
+ outputs = self.local_model.generate(
297
+ **inputs,
298
+ max_new_tokens=100, # Reduced from 150
299
+ temperature=0.5, # Lower for faster generation
300
+ top_p=0.8, # Lower top_p
301
+ do_sample=False, # Greedy decoding (faster)
302
+ use_cache=True,
303
+ pad_token_id=self.local_tokenizer.eos_token_id,
304
+ repetition_penalty=1.1,
305
+ # OPTIMIZATION: Early stopping
306
+ eos_token_id=self.local_tokenizer.eos_token_id,
307
+ )
308
+ ```
309
+
310
+ ### 4.3 Streaming Response (for better UX)
311
+
312
+ ```python
313
+ # For API endpoints, support streaming
314
+ def generate_answer_streaming(self, query: str, context, documents):
315
+ """Generate answer with streaming for better UX."""
316
+ if self.provider == LLM_PROVIDER_LOCAL:
317
+ # Use generate with stream=True
318
+ for token in self._generate_local_streaming(prompt):
319
+ yield token
320
+ elif self.provider == LLM_PROVIDER_OPENAI:
321
+ # Use OpenAI streaming API
322
+ for chunk in self.client.chat.completions.create(
323
+ model="gpt-3.5-turbo",
324
+ messages=[{"role": "user", "content": prompt}],
325
+ stream=True
326
+ ):
327
+ yield chunk.choices[0].delta.content
328
+ ```
329
+
330
+ ## 5. Response Caching Strategy
331
+
332
+ ### 5.1 Multi-level Caching
333
+
334
+ ```python
335
+ # backend/hue_portal/core/cache_utils.py
336
+
337
+ from functools import lru_cache
338
+ from django.core.cache import cache
339
+ import hashlib
340
+ import json
341
+
342
+ class ChatbotCache:
343
+ """Multi-level caching for chatbot responses."""
344
+
345
+ CACHE_TIMEOUT = 3600 # 1 hour
346
+
347
+ @staticmethod
348
+ def get_cache_key(query: str, intent: str, session_id: str = None) -> str:
349
+ """Generate cache key."""
350
+ key_parts = [query.lower().strip(), intent]
351
+ if session_id:
352
+ key_parts.append(session_id)
353
+ key_str = "|".join(key_parts)
354
+ return f"chatbot:{hashlib.md5(key_str.encode()).hexdigest()}"
355
+
356
+ @staticmethod
357
+ def get_cached_response(query: str, intent: str, session_id: str = None):
358
+ """Get cached response."""
359
+ cache_key = ChatbotCache.get_cache_key(query, intent, session_id)
360
+ return cache.get(cache_key)
361
+
362
+ @staticmethod
363
+ def set_cached_response(query: str, intent: str, response: dict, session_id: str = None):
364
+ """Cache response."""
365
+ cache_key = ChatbotCache.get_cache_key(query, intent, session_id)
366
+ cache.set(cache_key, response, ChatbotCache.CACHE_TIMEOUT)
367
+
368
+ @staticmethod
369
+ def get_cached_search_results(query: str, model_name: str, text_fields: tuple):
370
+ """Get cached search results."""
371
+ key = f"search:{hashlib.md5(f'{query}|{model_name}|{text_fields}'.encode()).hexdigest()}"
372
+ return cache.get(key)
373
+
374
+ @staticmethod
375
+ def set_cached_search_results(query: str, model_name: str, text_fields: tuple, results):
376
+ """Cache search results."""
377
+ key = f"search:{hashlib.md5(f'{query}|{model_name}|{text_fields}'.encode()).hexdigest()}"
378
+ cache.set(key, results, ChatbotCache.CACHE_TIMEOUT)
379
+ ```
380
+
381
+ ### 5.2 Integrate vào Chatbot
382
+
383
+ ```python
384
+ # backend/hue_portal/core/chatbot.py
385
+
386
+ from .cache_utils import ChatbotCache
387
+
388
+ class Chatbot:
389
+ def generate_response(self, query: str, session_id: str = None) -> Dict[str, Any]:
390
+ query = query.strip()
391
+
392
+ # Classify intent
393
+ intent, confidence = self.classify_intent(query)
394
+
395
+ # Check cache first
396
+ cached_response = ChatbotCache.get_cached_response(query, intent, session_id)
397
+ if cached_response:
398
+ return cached_response
399
+
400
+ # ... existing logic
401
+
402
+ # Cache response before returning
403
+ response = {
404
+ "message": message,
405
+ "intent": intent,
406
+ "confidence": confidence,
407
+ "results": search_result["results"],
408
+ "count": search_result["count"]
409
+ }
410
+
411
+ ChatbotCache.set_cached_response(query, intent, response, session_id)
412
+ return response
413
+ ```
414
+
415
+ ## 6. Optimize Query Expansion
416
+
417
+ ### 6.1 Cache Synonyms
418
+
419
+ ```python
420
+ # backend/hue_portal/core/search_ml.py
421
+
422
+ from functools import lru_cache
+ from django.core.cache import cache
423
+
424
+ @lru_cache(maxsize=1)
425
+ def get_all_synonyms():
426
+ """Get all synonyms (cached)."""
427
+ return list(Synonym.objects.all())
428
+
429
+ def expand_query_with_synonyms(query: str) -> List[str]:
430
+ """Expand query using cached synonyms."""
431
+ query_normalized = normalize_text(query)
432
+ expanded = [query_normalized]
433
+
434
+ # Use cached synonyms
435
+ synonyms = get_all_synonyms()
436
+
437
+ for synonym in synonyms:
438
+ keyword = normalize_text(synonym.keyword)
439
+ alias = normalize_text(synonym.alias)
440
+
441
+ if keyword in query_normalized:
442
+ expanded.append(query_normalized.replace(keyword, alias))
443
+ if alias in query_normalized:
444
+ expanded.append(query_normalized.replace(alias, keyword))
445
+
446
+ return list(set(expanded))
447
+ ```
448
+
449
+ ## 7. Database Query Optimization
450
+
451
+ ### 7.1 Use select_related / prefetch_related
452
+
453
+ ```python
454
+ # backend/hue_portal/core/chatbot.py
455
+
456
+ def search_by_intent(self, intent: str, query: str, limit: int = 5):
457
+ if intent == "search_fine":
458
+ qs = Fine.objects.all().select_related('decree') # If has FK
459
+ # ... rest
460
+
461
+ elif intent == "search_legal":
462
+ qs = LegalSection.objects.all().select_related('document')
463
+ # ... rest
464
+ ```
465
+
466
+ ### 7.2 Add Database Indexes
467
+
468
+ ```python
469
+ # backend/hue_portal/core/models.py
470
+
471
+ class Fine(models.Model):
472
+ name = models.CharField(max_length=500, db_index=True) # Add index
473
+ code = models.CharField(max_length=50, db_index=True) # Add index
474
+
475
+ class Meta:
476
+ indexes = [
477
+ models.Index(fields=['name', 'code']),
478
+ models.Index(fields=['min_fine', 'max_fine']),
479
+ ]
480
+ ```
481
+
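+ The new `db_index` flags and `Meta.indexes` only reach the database after a migration is generated and applied. A hedged sketch of what `makemigrations` might produce (migration module name, dependency, and index name are placeholders, not the project's actual migration):
+
+ ```python
+ # core/migrations/000X_add_fine_indexes.py (sketch)
+ from django.db import migrations, models
+
+
+ class Migration(migrations.Migration):
+     dependencies = [("core", "000W_previous")]  # placeholder dependency
+
+     operations = [
+         migrations.AlterField(
+             model_name="fine",
+             name="name",
+             field=models.CharField(max_length=500, db_index=True),
+         ),
+         migrations.AddIndex(
+             model_name="fine",
+             index=models.Index(fields=["name", "code"], name="fine_name_code_idx"),
+         ),
+     ]
+ ```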
482
+ ## 8. Optimize the Frontend
483
+
484
+ ### 8.1 Debounce Search Input
485
+
486
+ ```typescript
487
+ // frontend/src/pages/Chat.tsx
488
+
489
+ const [input, setInput] = useState('')
490
+ const debouncedInput = useDebounce(input, 300) // Wait 300ms
491
+
492
+ useEffect(() => {
493
+ if (debouncedInput) {
494
+ // Trigger search suggestions
495
+ }
496
+ }, [debouncedInput])
497
+ ```
498
+
499
+ ### 8.2 Optimistic UI Updates
500
+
501
+ ```typescript
502
+ const handleSend = async (messageText?: string) => {
503
+ // Show message immediately (optimistic)
504
+ setMessages(prev => [...prev, {
505
+ role: 'user',
506
+ content: textToSend,
507
+ timestamp: new Date()
508
+ }])
509
+
510
+ // Then fetch response
511
+ const response = await chat(textToSend, sessionId)
512
+ // Update with actual response
513
+ }
514
+ ```
515
+
516
+ ## 9. Monitoring & Metrics
517
+
518
+ ### 9.1 Add Performance Logging
519
+
520
+ ```python
521
+ # backend/hue_portal/chatbot/views.py
522
+
523
+ import time
524
+ import logging
+ from django.utils import timezone
+
+ logger = logging.getLogger(__name__)
525
+
526
+ @api_view(["POST"])
527
+ def chat(request: Request) -> Response:
528
+ start_time = time.time()
529
+
530
+ # ... existing logic
531
+
532
+ # Log performance metrics
533
+ elapsed = time.time() - start_time
534
+ logger.info(f"[PERF] Chat response time: {elapsed:.3f}s | Intent: {intent} | Results: {count}")
535
+
536
+ # Track slow queries
537
+ if elapsed > 2.0:
538
+ logger.warning(f"[SLOW] Query took {elapsed:.3f}s: {message[:100]}")
539
+
540
+ return Response(response)
541
+ ```
542
+
543
+ ### 9.2 Track Cache Hit Rate
544
+
545
+ ```python
546
+ class ChatbotCache:
547
+ cache_hits = 0
548
+ cache_misses = 0
549
+
550
+ @staticmethod
551
+ def get_cached_response(query: str, intent: str, session_id: str = None):
552
+ cached = cache.get(ChatbotCache.get_cache_key(query, intent, session_id))
553
+ if cached:
554
+ ChatbotCache.cache_hits += 1
555
+ return cached
556
+ ChatbotCache.cache_misses += 1
557
+ return None
558
+
559
+ @staticmethod
560
+ def get_cache_stats():
561
+ total = ChatbotCache.cache_hits + ChatbotCache.cache_misses
562
+ if total == 0:
563
+ return {"hit_rate": 0, "hits": 0, "misses": 0}
564
+ return {
565
+ "hit_rate": ChatbotCache.cache_hits / total,
566
+ "hits": ChatbotCache.cache_hits,
567
+ "misses": ChatbotCache.cache_misses
568
+ }
569
+ ```
570
+
571
+ ## 10. Expected Performance Improvements
572
+
573
+ | Optimization | Current | Optimized | Improvement |
574
+ |-------------|---------|-----------|-------------|
575
+ | Intent Classification | 5-10ms | 1-3ms | **70% faster** |
576
+ | Search (small dataset) | 50-100ms | 10-30ms | **70% faster** |
577
+ | Search (large dataset) | 200-500ms | 50-150ms | **70% faster** |
578
+ | LLM Generation (cached) | 1-5s | 0.01-0.1s | **99% faster** |
579
+ | LLM Generation (uncached) | 1-5s | 0.8-4s | **20% faster** |
580
+ | Total Response (cached) | 100-500ms | 10-50ms | **90% faster** |
581
+ | Total Response (uncached) | 1-6s | 0.5-3s | **50% faster** |
582
+
583
+ ## 11. Implementation Priority
584
+
585
+ ### Phase 1: Quick Wins (1-2 days)
586
+ 1. ✅ Add response caching (Django cache)
587
+ 2. ✅ Pre-compile keyword patterns (see the sketch after this list)
588
+ 3. ✅ Cache synonyms
589
+ 4. ✅ Add database indexes
590
+ 5. ✅ Early exit for exact matches
591
+
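+ A minimal sketch of item 2, with an `lru_cache` wrapper that also gives repeated queries a cheap early exit; the keyword sets and function name are illustrative, not the project's actual classifier:
+
+ ```python
+ import re
+ from functools import lru_cache
+ from typing import Optional
+
+ # Compiled once at import time instead of on every request (illustrative keyword sets)
+ INTENT_PATTERNS = {
+     "search_fine": re.compile(r"mức phạt|xử phạt|phạt bao nhiêu", re.IGNORECASE),
+     "search_procedure": re.compile(r"thủ tục|hồ sơ|đăng ký", re.IGNORECASE),
+ }
+
+
+ @lru_cache(maxsize=1024)
+ def quick_intent(query: str) -> Optional[str]:
+     """Return an intent immediately when a pre-compiled pattern matches, else defer to the ML model."""
+     normalized = query.lower().strip()
+     for intent, pattern in INTENT_PATTERNS.items():
+         if pattern.search(normalized):
+             return intent
+     return None  # caller falls back to the full classifier
+ ```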
592
+ ### Phase 2: Medium Impact (3-5 days)
593
+ 1. ✅ Limit QuerySet before loading
594
+ 2. ✅ Optimize TF-IDF calculation
595
+ 3. ✅ Prompt caching for LLM
596
+ 4. ✅ Optimize local model generation
597
+ 5. ✅ Add performance logging
598
+
599
+ ### Phase 3: Advanced (1-2 weeks)
600
+ 1. ✅ Streaming responses
601
+ 2. ✅ Incremental TF-IDF
602
+ 3. ✅ Advanced caching strategies
603
+ 4. ✅ Query result pre-computation
604
+
605
+ ## 12. Testing Performance
606
+
607
+ ```python
608
+ # backend/scripts/benchmark_chatbot.py
609
+
610
+ import time
611
+ import statistics
+
+ from hue_portal.core.chatbot import get_chatbot
612
+
613
+ def benchmark_chatbot():
614
+ chatbot = get_chatbot()
615
+ test_queries = [
616
+ "Mức phạt vượt đèn đỏ là bao nhiêu?",
617
+ "Thủ tục đăng ký cư trú cần gì?",
618
+ "Địa chỉ công an phường ở đâu?",
619
+ # ... more queries
620
+ ]
621
+
622
+ times = []
623
+ for query in test_queries:
624
+ start = time.time()
625
+ response = chatbot.generate_response(query)
626
+ elapsed = time.time() - start
627
+ times.append(elapsed)
628
+ print(f"Query: {query[:50]}... | Time: {elapsed:.3f}s")
629
+
630
+ print(f"\nAverage: {statistics.mean(times):.3f}s")
631
+ print(f"Median: {statistics.median(times):.3f}s")
632
+ print(f"P95: {statistics.quantiles(times, n=20)[18]:.3f}s")
633
+ ```
634
+
635
+ ## Conclusion
+
+ With the optimizations above, the chatbot will be:
+ - **50-90% faster** for cached queries
+ - **20-70% faster** for uncached queries
+ - **More accurate** thanks to early exit and exact matching
+ - **More scalable** thanks to database indexes and query limiting
642
+
backend/TEST_API_MODE.md ADDED
@@ -0,0 +1,83 @@
1
+ # API Mode Test Guide
+
+ ## Current issues
+ - HF Spaces is not receiving requests from the local project
+ - Responses are still template-based (not generated by the LLM)
+
+ ## Already fixed
+ 1. ✅ API mode now sends the `prompt` (with documents) instead of only the `query`
+ 2. ✅ Added detailed logging: `[LLM] 🔗 Calling API`, `[RAG] Using LLM provider`
+
+ ## How to test
+
+ ### 1. Fix the database error (if needed)
+ ```bash
+ # Check whether PostgreSQL is running
+ psql -h localhost -p 5543 -U hue -d hue_portal
+
+ # Or temporarily use SQLite (edit settings.py)
+ ```
+
+ ### 2. Start the server with the correct env
+ ```bash
+ cd /Users/davidtran/Downloads/TryHarDemNayProject/backend
+ source venv/bin/activate
+ cd hue_portal
+
+ # Check the env
+ cat ../.env | grep LLM
+
+ # Start the server
+ python3 manage.py runserver 0.0.0.0:8000
+ ```
+
+ ### 3. Test API mode
+ ```bash
+ # Test with a question that has matching documents
+ curl -X POST http://localhost:8000/api/chatbot/chat/ \
+ -H "Content-Type: application/json" \
+ -d '{"message": "Mức phạt vượt đèn đỏ là bao nhiêu?", "reset_session": false}'
+ ```
+
+ ### 4. Check the server logs
+ Look for the following log lines:
+ - `[RAG] Using LLM provider: api` - the LLM is being called
+ - `[LLM] 🔗 Calling API: https://davidtran999-hue-portal-backend.hf.space/api/chatbot/chat/` - HF Spaces is being called
+ - `[LLM] 📥 Response status: 200` - HF Spaces returned a response
+ - `[LLM] ✅ Got message from API` - a message was received from the API
+
+ If these log lines do NOT appear:
+ - The LLM was not called (check `use_llm=True`)
+ - LLM generation failed (check the error logs)
+ - The LLM is not available (check `get_llm_generator()`)
+
+ ## Debug checklist
+
+ - [ ] The server starts successfully (no database error)
+ - [ ] `.env` contains `LLM_PROVIDER=api` and `HF_API_BASE_URL=...`
+ - [ ] The server loads the right env (restart after editing `.env`)
+ - [ ] Test with a question that has matching documents (not a greeting)
+ - [ ] Check the server logs for `[LLM]` and `[RAG]` entries
+ - [ ] Check whether HF Spaces is up and running
+
+ ## If it still does not work
+
+ 1. **Check whether the LLM is called:**
+    - Look for the log `[RAG] Using LLM provider: api`
+    - If it is missing, check `use_llm=True` in `rag_pipeline()`
+
+ 2. **Check the API call:**
+    - Look for the log `[LLM] 🔗 Calling API: ...`
+    - If it is missing, check whether `_generate_api()` is being called
+
+ 3. **Check the response:**
+    - Look for the log `[LLM] 📥 Response status: ...`
+    - If it is 200, check the response content
+    - If it is an error, read the error message
+
+ 4. **Test the API directly:**
+ ```bash
+ curl -X POST https://davidtran999-hue-portal-backend.hf.space/api/chatbot/chat/ \
+ -H "Content-Type: application/json" \
+ -d '{"message": "Test", "reset_session": false}'
+ ```
backend/WHY_LLM_NOT_CALLED.md ADDED
@@ -0,0 +1,76 @@
1
+ # Why isn't the LLM being called?
+
+ ## Problem
+
+ The chatbot does answer, but the response is **template-based** (it does not come from the LLM API).
+
+ ## Root cause
+
+ ### 1. No documents were found
+ - The response shows: `"count": 0`, `"results": []`
+ - The database has no tables yet, or no data
+
+ ### 2. The LLM is only called when documents ARE found
+
+ In `rag.py`:
+ ```python
+ # Try LLM generation first if enabled and documents are available
+ if use_llm and documents: # ← documents are required
+ llm = get_llm_generator()
+ if llm:
+ llm_answer = llm.generate_answer(...)
+ ```
+
+ **Logic:**
+ - If there are **NO documents** → return a template message immediately
+ - If documents **ARE found** → call the LLM to generate the answer
+
+ ## Solution
+
+ ### 1. Run migrations to create the tables
+ ```bash
+ cd backend && source venv/bin/activate && cd hue_portal
+ python3 manage.py makemigrations
+ python3 manage.py migrate
+ ```
+
+ ### 2. Import/ingest data into the database
+ - Data is needed for fines, procedures, legal sections, etc.
+ - Once data exists, search will find documents
+ - When documents are found, the LLM will be called
+
+ ### 3. Test with a question that has matching documents
+ - If the database already has data, test with a question that is definitely covered by the DB
+ - Example: "Mức phạt vượt đèn đỏ" (if fine data is present)
+
+ ## How the flow works
+
+ 1. **User sends a question** → `chatbot/views.py`
+ 2. **Intent classification** → determine the type of question
+ 3. **RAG pipeline** → look for documents in the database
+    - If there are **NO documents** → return a template message
+    - If documents **ARE found** → call the LLM to generate the answer
+ 4. **LLM generation** (only when documents exist):
+    - `get_llm_generator()` → get the LLM instance
+    - `llm.generate_answer(query, documents=documents)` → generate
+    - In API mode: call the HF Spaces API with the prompt (including documents)
+ 5. **Response** → returned to the user (see the sketch below)
+
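+ A compressed sketch of this flow; only `get_llm_generator()` and `generate_answer()` come from the snippet above, while `build_template_answer()` and the exact return shape are illustrative:
+
+ ```python
+ def rag_pipeline(query, documents, use_llm=True):
+     """Sketch: the LLM is only reached when retrieval actually found documents."""
+     if not documents:
+         # No documents -> template answer, the LLM is never called
+         return {"message": "No matching documents found.", "results": [], "count": 0}
+     if use_llm:
+         llm = get_llm_generator()
+         if llm:
+             answer = llm.generate_answer(query, documents=documents)
+             return {"message": answer, "results": documents, "count": len(documents)}
+     # LLM disabled or unavailable -> template built from the retrieved documents
+     return {"message": build_template_answer(documents), "results": documents, "count": len(documents)}
+ ```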
+ ## To test API mode
+
+ 1. **Make sure the database has data**
+ 2. **Send a question that has matching documents** (for example: "Mức phạt vượt đèn đỏ")
+ 3. **Check the server logs** for:
+    - `[RAG] Using LLM provider: api`
+    - `[LLM] 🔗 Calling API: ...`
+    - `[LLM] 📥 Response status: 200`
+
+ ## Notes
+
+ - **API mode is already configured correctly** (`LLM_PROVIDER=api`)
+ - **The code has been fixed to send the prompt (with documents)** instead of only the query
+ - **The current problem:** the database has no data yet → no documents → the LLM is not called
+
backend/chuyenapichatbot.py CHANGED
File without changes
backend/docs/API_ENDPOINTS.md ADDED
@@ -0,0 +1,152 @@
1
+ # Chatbot API Endpoints
2
+
3
+ ## Overview
4
+
5
+ This document describes the chatbot API endpoints available in the system.
6
+
7
+ ## Base URL
8
+
9
+ - Default: `http://localhost:8000`
10
+ - Override via env when running test scripts:
11
+ ```bash
12
+ export API_BASE_URL=http://localhost:8090 # e.g. when runserver uses port 8090
13
+ ```
14
+
15
+ ## Endpoints
16
+
17
+ ### 1. Health Check
18
+
19
+ **Endpoint**: `GET /api/chatbot/health/`
20
+
21
+ **Description**: Check the health status of the chatbot service.
22
+
23
+ **Response**:
24
+ ```json
25
+ {
26
+ "status": "healthy",
27
+ "service": "chatbot",
28
+ "classifier_loaded": true
29
+ }
30
+ ```
31
+
32
+ **Example**:
33
+ ```bash
34
+ curl http://localhost:8000/api/chatbot/health/
35
+ ```
36
+
37
+ ### 2. Chat
38
+
39
+ **Endpoint**: `POST /api/chat/`
40
+
41
+ **Description**: Send a message to the chatbot and get a response.
42
+
43
+ **Request Body**:
44
+ ```json
45
+ {
46
+ "message": "Làm thủ tục cư trú cần gì?"
47
+ }
48
+ ```
49
+
50
+ **Response**:
51
+ ```json
52
+ {
53
+ "message": "Tôi tìm thấy 5 thủ tục liên quan đến 'Làm thủ tục cư trú cần gì?':\n\n1. Đăng ký thường trú\n ...",
54
+ "intent": "search_procedure",
55
+ "confidence": 0.95,
56
+ "results": [
57
+ {
58
+ "type": "procedure",
59
+ "data": {
60
+ "id": 1,
61
+ "title": "Đăng ký thường trú",
62
+ "domain": "Cư trú",
63
+ ...
64
+ }
65
+ }
66
+ ],
67
+ "count": 5
68
+ }
69
+ ```
70
+
71
+ **Example**:
72
+ ```bash
73
+ curl -X POST http://localhost:8000/api/chat/ \
74
+ -H "Content-Type: application/json" \
75
+ -d '{"message": "Làm thủ tục cư trú cần gì?"}'
76
+ ```
77
+
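+ The same request from Python, as a small sketch (assumes the `requests` package; the response fields follow the example above):
+
+ ```python
+ import requests
+
+ resp = requests.post(
+     "http://localhost:8000/api/chat/",
+     json={"message": "Làm thủ tục cư trú cần gì?"},
+     timeout=30,
+ )
+ resp.raise_for_status()
+ data = resp.json()
+ print(data["intent"], data["confidence"], data["count"])
+ print(data["message"])
+ ```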
78
+ ## Intent Types
79
+
80
+ The chatbot can classify queries into the following intents:
81
+
82
+ - `search_fine`: Search for traffic fines
83
+ - `search_procedure`: Search for administrative procedures
84
+ - `search_office`: Search for office/unit information
85
+ - `search_advisory`: Search for security advisories
86
+ - `general_query`: General queries
87
+ - `greeting`: Greetings
88
+
89
+ ## Response Fields
90
+
91
+ - `message`: The response message to display to the user
92
+ - `intent`: The classified intent
93
+ - `confidence`: Confidence score (0.0 to 1.0)
94
+ - `results`: Array of search results
95
+ - `count`: Number of results found
96
+
97
+ ## Error Handling
98
+
99
+ ### 400 Bad Request
100
+
101
+ ```json
102
+ {
103
+ "error": "message is required"
104
+ }
105
+ ```
106
+
107
+ ### 500 Internal Server Error
108
+
109
+ ```json
110
+ {
111
+ "message": "Xin lỗi, có lỗi xảy ra. Vui lòng thử lại.",
112
+ "intent": "error",
113
+ "error": "Error details",
114
+ "results": [],
115
+ "count": 0
116
+ }
117
+ ```
118
+
119
+ ## Testing
120
+
121
+ Use the provided test script:
122
+
123
+ ```bash
124
+ cd backend
125
+ API_BASE_URL=http://localhost:8090 \
126
+ POSTGRES_HOST=localhost POSTGRES_PORT=5433 \
127
+ python scripts/test_api_endpoint.py
128
+ ```
129
+
130
+ The script automatically:
131
+ - Hits `GET /api/chatbot/health/` to confirm classifier loading.
132
+ - Sends six representative queries and reports status, intent, confidence, latency, and first result title.
133
+
134
+ ## API Endpoint Testing & Fixes — 2025-11-14
135
+
136
+ - Added trailing slashes to `backend/hue_portal/chatbot/urls.py` and `backend/hue_portal/core/urls.py` so `/api/chatbot/health/` and `/api/chat/` resolve correctly.
137
+ - Hardened chatbot serialization via `_serialize_document` to avoid `TypeError: Object of type type is not JSON serializable`.
138
+ - Latest test run:
139
+ - Command: `API_BASE_URL=http://localhost:8090 POSTGRES_HOST=localhost POSTGRES_PORT=5433 python scripts/test_api_endpoint.py`
140
+ - Result: **6/6** successful queries, **100 % intent accuracy**, avg latency **~3.7 s** (first call includes SentenceTransformer warm-up).
141
+ - Checklist before running tests:
142
+ 1. `POSTGRES_HOST=localhost POSTGRES_PORT=5433 ../../.venv/bin/python manage.py runserver 0.0.0.0:8090`
143
+ 2. Ensure `API_BASE_URL` matches runserver port.
144
+ 3. (Optional) export `DJANGO_DEBUG=1` for verbose stack traces during local debugging.
145
+
146
+ ## Notes
147
+
148
+ - The API uses RAG (Retrieval-Augmented Generation) pipeline for generating responses
149
+ - Hybrid search (BM25 + Vector similarity) is used for retrieval
150
+ - Intent classification uses ML model with keyword-based fallback
151
+ - Response latency typically ranges from 200-1000ms depending on query complexity
152
+
backend/docs/INTENT_CLASSIFICATION_IMPROVEMENTS.md ADDED
@@ -0,0 +1,87 @@
1
+ # Intent Classification Improvements
2
+
3
+ ## Overview
4
+
5
+ This document describes the improvements made to intent classification in Plan 5.
6
+
7
+ ## Problem Identified
8
+
9
+ Query "Cảnh báo lừa đảo giả danh công an" was being classified as `search_office` instead of `search_advisory`.
10
+
11
+ ### Root Cause
12
+
13
+ 1. **Keyword Conflict**: The keyword "công an" appears in both `search_office` and queries about `search_advisory`
14
+ 2. **Order of Checks**: The code checked `has_office_keywords` before `has_advisory_keywords`, causing office keywords to match first
15
+ 3. **Limited Training Data**: The `search_advisory` intent had only 7 examples, compared to more examples in other intents
16
+
17
+ ## Solutions Implemented
18
+
19
+ ### 1. Improved Keyword Matching Logic
20
+
21
+ **File**: `backend/hue_portal/chatbot/chatbot.py`
22
+
23
+ - Changed order: Check `has_advisory_keywords` **before** `has_office_keywords` (see the sketch below)
24
+ - Added more keywords for advisory: "mạo danh", "thủ đoạn", "cảnh giác"
25
+ - This ensures advisory queries are matched first when they contain both advisory and office keywords
26
+
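+ A simplified sketch of the reordered fallback; the keyword lists are abbreviated and the function name is illustrative rather than the exact code in `chatbot.py`:
+
+ ```python
+ ADVISORY_KEYWORDS = ["lừa đảo", "cảnh báo", "mạo danh", "thủ đoạn", "cảnh giác"]
+ OFFICE_KEYWORDS = ["công an", "địa chỉ", "trụ sở"]
+
+
+ def keyword_fallback_intent(query: str) -> str:
+     q = query.lower()
+     has_advisory = any(kw in q for kw in ADVISORY_KEYWORDS)
+     has_office = any(kw in q for kw in OFFICE_KEYWORDS)
+     # Advisory is checked first: "cảnh báo lừa đảo giả danh công an" contains both kinds of
+     # keywords and must resolve to search_advisory, not search_office.
+     if has_advisory:
+         return "search_advisory"
+     if has_office:
+         return "search_office"
+     return "general_query"
+ ```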
27
+ ### 2. Enhanced Training Data
28
+
29
+ **File**: `backend/hue_portal/chatbot/training/intent_dataset.json`
30
+
31
+ - Expanded `search_advisory` examples from 7 to 23 examples
32
+ - Added specific examples:
33
+ - "cảnh báo lừa đảo giả danh công an"
34
+ - "mạo danh cán bộ công an"
35
+ - "lừa đảo mạo danh"
36
+ - And 15 more variations
37
+
38
+ ### 3. Retrained Model
39
+
40
+ - Retrained intent classification model with improved training data
41
+ - Model accuracy improved
42
+ - Better handling of edge cases
43
+
44
+ ## Results
45
+
46
+ ### Before Improvements
47
+
48
+ - Query "Cảnh báo lừa đảo giả danh công an" → `search_office` (incorrect)
49
+ - Limited training examples for `search_advisory`
50
+
51
+ ### After Improvements
52
+
53
+ - Query "Cảnh báo lừa đảo giả danh công an" → `search_advisory` (correct)
54
+ - More balanced training data across all intents
55
+ - Better keyword matching logic
56
+
57
+ ## Testing
58
+
59
+ Test queries that now work correctly:
60
+
61
+ - "Cảnh báo lừa đảo giả danh công an" → `search_advisory`
62
+ - "Lừa đảo mạo danh cán bộ" → `search_advisory`
63
+ - "Mạo danh cán bộ công an" → `search_advisory`
64
+
65
+ ## 2025-11-14 Update — Serialization & API Regression
66
+
67
+ - Added `_serialize_document` in `backend/hue_portal/chatbot/chatbot.py` so RAG responses return JSON-safe payloads (no more `TypeError: Object of type type is not JSON serializable` when embeddings include model instances).
68
+ - Re-tested intents end-to-end via `scripts/test_api_endpoint.py` (6 queries spanning all intents):
69
+ - **Result:** 6/6 passed, 100 % intent accuracy.
70
+ - **Latency:** avg ~3.7 s (note: first call warms up `keepitreal/vietnamese-sbert-v2`, subsequent calls ≤1.8 s).
71
+ - Health checklist before testing:
72
+ 1. `POSTGRES_HOST=localhost POSTGRES_PORT=5433 ../../.venv/bin/python manage.py runserver 0.0.0.0:8090`
73
+ 2. `API_BASE_URL=http://localhost:8090 python scripts/test_api_endpoint.py`
74
+ 3. Watch server logs for any serialization warnings (none observed after fix).
75
+
76
+ ## Files Modified
77
+
78
+ 1. `backend/hue_portal/chatbot/training/intent_dataset.json` - Enhanced training data
79
+ 2. `backend/hue_portal/chatbot/chatbot.py` - Improved keyword matching logic
80
+ 3. `backend/hue_portal/chatbot/training/artifacts/intent_model.joblib` - Retrained model
81
+
82
+ ## Future Improvements
83
+
84
+ - Continue to add more training examples as edge cases are discovered
85
+ - Consider using more sophisticated ML models (e.g., transformer-based)
86
+ - Implement active learning to automatically improve from user feedback
87
+
backend/docs/LEGAL_REFRESH.md ADDED
@@ -0,0 +1,55 @@
1
+ # Legal Data Refresh Workflow
2
+
3
+ Use this sequence whenever new DOCX/PDF files are imported outside the user-facing UI (e.g. nightly ETL or bulk manifests).
4
+
5
+ ## Prerequisites
6
+
7
+ - Postgres + Redis running.
8
+ - Celery worker online (for interactive uploads) or `CELERY_TASK_ALWAYS_EAGER=true` for synchronous runs.
9
+ - Tesseract OCR installed (see `OCR_SETUP.md`).
10
+
11
+ ## Manual Command Sequence
12
+
13
+ ```
14
+ cd backend/hue_portal
15
+ source ../.venv/bin/activate
16
+
17
+ python manage.py load_legal_document --file "/path/to/docx" --code DOC-123
18
+ python ../scripts/generate_embeddings.py --model legal
19
+ python ../scripts/build_faiss_index.py --model legal
20
+ ```
21
+
22
+ Notes:
23
+
24
+ - `load_legal_document` can be substituted with the manifest loader (`scripts/load_legal_documents.py`) if multiple files need ingestion.
25
+ - The embedding script logs processed sections; expect a SHA checksum for each chunk.
26
+ - FAISS builder writes artifacts under `backend/hue_portal/artifacts/faiss_indexes`.
27
+
28
+ ## Automated Helper
29
+
30
+ `backend/scripts/refresh_legal_data.sh` wraps the three steps:
31
+
32
+ ```
33
+ ./backend/scripts/refresh_legal_data.sh \
34
+ --file "/path/to/THONG-TU.docx" \
35
+ --code TT-02
36
+ ```
37
+
38
+ Flags:
39
+
40
+ - `--skip-ingest` to only regenerate embeddings/index (useful after editing chunking logic).
41
+ - `--python` to point at a specific interpreter (default `python3`).
42
+
43
+ ## CI / Nightly Jobs
44
+
45
+ 1. Sync new files into `tài nguyên/`.
46
+ 2. Run the helper script for each file (or call the manifest loader first); see the sketch after this list.
47
+ 3. Archive FAISS artifacts (upload to object storage) so the chatbot containers can download them at boot.
48
+ 4. Record build duration and artifact checksums for auditing.
49
+
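+ A minimal sketch of how steps 2 and 4 could be scripted; the inbox path, document-code rule, and log format are assumptions, not existing project code:
+
+ ```python
+ #!/usr/bin/env python3
+ """Nightly legal-data refresh (sketch)."""
+ import hashlib
+ import subprocess
+ import time
+ from pathlib import Path
+
+ INBOX = Path("tài nguyên")  # directory where new DOCX/PDF files are synced
+ HELPER = "./backend/scripts/refresh_legal_data.sh"
+
+ for doc in sorted(INBOX.glob("*.docx")):
+     code = doc.stem.upper()[:20]  # naive document code, adjust to the real convention
+     started = time.time()
+     subprocess.run([HELPER, "--file", str(doc), "--code", code], check=True)
+     sha = hashlib.sha256(doc.read_bytes()).hexdigest()
+     print(f"{doc.name} code={code} sha256={sha} took={time.time() - started:.1f}s")
+ ```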
50
+ ## Verification Checklist
51
+
52
+ - `generate_embeddings` log ends with `Completed model=legal`.
53
+ - FAISS directory contains fresh timestamped `.faiss` + `.mappings.pkl`.
54
+ - Sample chatbot query (“Thông tư 02 ...”) returns snippets referencing the newly ingested document.
55
+
backend/docs/OCR_SETUP.md ADDED
@@ -0,0 +1,56 @@
1
+ # Tesseract OCR Runtime Setup
2
+
3
+ PyMuPDF + `pytesseract` require the native **tesseract-ocr** binary (with Vietnamese language data) to extract text from scanned PDFs. Install it on every environment that runs ingestion or Celery workers.
4
+
5
+ ## Docker / CI (Debian-based)
6
+
7
+ The backend Dockerfile already installs the required packages:
8
+
9
+ ```bash
10
+ apt-get update && apt-get install -y \
11
+ tesseract-ocr \
12
+ tesseract-ocr-eng \
13
+ tesseract-ocr-vie
14
+ ```
15
+
16
+ For GitHub Actions or other CI images, run the same command before executing tests that touch OCR.
17
+
18
+ ## macOS (Homebrew)
19
+
20
+ ```bash
21
+ brew install tesseract
22
+ brew install tesseract-lang # optional (contains vie)
23
+ ```
24
+
25
+ Verify:
26
+
27
+ ```bash
28
+ tesseract --version
29
+ ls /opt/homebrew/Cellar/tesseract/*/share/tessdata/vie.traineddata
30
+ ```
31
+
32
+ ## Ubuntu / Debian
33
+
34
+ ```bash
35
+ sudo apt update
36
+ sudo apt install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-vie
37
+ ```
38
+
39
+ ## Rocky / CentOS (DNF)
40
+
41
+ ```bash
42
+ sudo dnf install -y tesseract tesseract-langpack-eng tesseract-langpack-vie
43
+ ```
44
+
45
+ ## Configuration
46
+
47
+ - Set `OCR_LANGS` (default `vie+eng`) if additional language combinations are needed (see the sketch after this list).
48
+ - `OCR_PDF_ZOOM` (default `2.0`) controls rasterization DPI; increase for very small fonts.
49
+ - Check that `tesseract` is in `$PATH` for the user running Django/Celery.
50
+
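+ To see how these settings fit together, here is a minimal OCR snippet (illustrative only, not the project's ingestion code; it assumes PyMuPDF, pytesseract, and Pillow are installed):
+
+ ```python
+ import os
+
+ import fitz  # PyMuPDF
+ import pytesseract
+ from PIL import Image
+
+ langs = os.environ.get("OCR_LANGS", "vie+eng")
+ zoom = float(os.environ.get("OCR_PDF_ZOOM", "2.0"))
+
+ doc = fitz.open("scan.pdf")
+ page = doc[0]
+ pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))  # rasterize at a higher DPI
+ img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+ text = pytesseract.image_to_string(img, lang=langs)
+ print(text[:200])
+ ```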
51
+ ## Troubleshooting
52
+
53
+ 1. Run `tesseract --list-langs` to confirm Vietnamese appears.
54
+ 2. Ensure the worker container/user has read access to `/usr/share/tesseract-ocr/4.00/tessdata`.
55
+ 3. If OCR still fails, set `CELERY_TASK_ALWAYS_EAGER=true` locally to debug synchronously and inspect logs for `pytesseract` errors.
56
+
backend/golden_queries_example.json ADDED
@@ -0,0 +1,68 @@
1
+ [
2
+ {
3
+ "query": "Mức phạt vượt đèn đỏ là bao nhiêu?",
4
+ "intent": "search_fine",
5
+ "response_message": "Mức phạt vượt đèn đỏ theo Nghị định 100/2019/NĐ-CP là từ 200.000 - 400.000 VNĐ, tùy thuộc vào mức độ vi phạm.",
6
+ "response_data": {
7
+ "message": "Mức phạt vượt đèn đỏ theo Nghị định 100/2019/NĐ-CP là từ 200.000 - 400.000 VNĐ, tùy thuộc vào mức độ vi phạm.",
8
+ "intent": "search_fine",
9
+ "confidence": 0.95,
10
+ "results": [
11
+ {
12
+ "type": "fine",
13
+ "data": {
14
+ "id": 1,
15
+ "name": "Vượt đèn đỏ",
16
+ "code": "V001",
17
+ "min_fine": 200000,
18
+ "max_fine": 400000,
19
+ "article": "Điều 5",
20
+ "decree": "Nghị định 100/2019/NĐ-CP"
21
+ }
22
+ }
23
+ ],
24
+ "count": 1
25
+ },
26
+ "verified_by": "legal_expert",
27
+ "accuracy_score": 1.0
28
+ },
29
+ {
30
+ "query": "Thủ tục đăng ký tạm trú cần những gì?",
31
+ "intent": "search_procedure",
32
+ "response_message": "Thủ tục đăng ký tạm trú cần các giấy tờ sau: CMND/CCCD, giấy tờ chứng minh nơi ở, đơn đăng ký tạm trú. Nộp tại Công an phường/xã nơi tạm trú.",
33
+ "response_data": {
34
+ "message": "Thủ tục đăng ký tạm trú cần các giấy tờ sau: CMND/CCCD, giấy tờ chứng minh nơi ở, đơn đăng ký tạm trú. Nộp tại Công an phường/xã nơi tạm trú.",
35
+ "intent": "search_procedure",
36
+ "confidence": 0.95,
37
+ "results": [
38
+ {
39
+ "type": "procedure",
40
+ "data": {
41
+ "id": 1,
42
+ "title": "Đăng ký tạm trú",
43
+ "domain": "Cư trú",
44
+ "level": "Phường/Xã"
45
+ }
46
+ }
47
+ ],
48
+ "count": 1
49
+ },
50
+ "verified_by": "legal_expert",
51
+ "accuracy_score": 1.0
52
+ },
53
+ {
54
+ "query": "Địa chỉ công an phường ở đâu?",
55
+ "intent": "search_office",
56
+ "response_message": "Địa chỉ công an phường tùy thuộc vào phường bạn đang ở. Bạn có thể tra cứu tại trang web hoặc liên hệ số điện thoại 0234.xxx.xxx để được hướng dẫn.",
57
+ "response_data": {
58
+ "message": "Địa chỉ công an phường tùy thuộc vào phường bạn đang ở. Bạn có thể tra cứu tại trang web hoặc liên hệ số điện thoại 0234.xxx.xxx để được hướng dẫn.",
59
+ "intent": "search_office",
60
+ "confidence": 0.95,
61
+ "results": [],
62
+ "count": 0
63
+ },
64
+ "verified_by": "manual",
65
+ "accuracy_score": 1.0
66
+ }
67
+ ]
68
+
backend/hue_portal/Procfile ADDED
File without changes
backend/hue_portal/chatbot/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ Chatbot app for handling conversational queries and natural language processing.
3
+ """
4
+
backend/hue_portal/chatbot/advanced_features.py ADDED
@@ -0,0 +1,185 @@
1
+ """
2
+ Advanced features for chatbot: follow-up suggestions, ambiguity detection, explanations.
3
+ """
4
+ from typing import List, Dict, Any, Optional, Tuple
5
+ from hue_portal.core.models import Fine, Procedure, Office, Advisory
6
+
7
+
8
+ def suggest_follow_up_questions(query: str, results: List[Any], intent: str) -> List[str]:
9
+ """
10
+ Suggest follow-up questions based on query and results.
11
+
12
+ Args:
13
+ query: Original query.
14
+ results: Retrieved results.
15
+ intent: Detected intent.
16
+
17
+ Returns:
18
+ List of suggested follow-up questions.
19
+ """
20
+ suggestions = []
21
+
22
+ if intent == "search_fine":
23
+ if results:
24
+ # Suggest questions about related fines
25
+ suggestions.append("Còn mức phạt nào khác không?")
26
+ suggestions.append("Điều luật liên quan là gì?")
27
+ suggestions.append("Biện pháp khắc phục như thế nào?")
28
+ else:
29
+ suggestions.append("Bạn có thể cho biết cụ thể loại vi phạm không?")
30
+
31
+ elif intent == "search_procedure":
32
+ if results:
33
+ suggestions.append("Hồ sơ cần chuẩn bị gì?")
34
+ suggestions.append("Lệ phí là bao nhiêu?")
35
+ suggestions.append("Thời hạn xử lý là bao lâu?")
36
+ suggestions.append("Nộp hồ sơ ở đâu?")
37
+ else:
38
+ suggestions.append("Bạn muốn tìm thủ tục nào cụ thể?")
39
+
40
+ elif intent == "search_office":
41
+ if results:
42
+ suggestions.append("Số điện thoại liên hệ?")
43
+ suggestions.append("Giờ làm việc như thế nào?")
44
+ suggestions.append("Địa chỉ cụ thể ở đâu?")
45
+ else:
46
+ suggestions.append("Bạn muốn tìm đơn vị nào?")
47
+
48
+ elif intent == "search_advisory":
49
+ if results:
50
+ suggestions.append("Còn cảnh báo nào khác không?")
51
+ suggestions.append("Cách phòng tránh như thế nào?")
52
+ else:
53
+ suggestions.append("Bạn muốn tìm cảnh báo về chủ đề gì?")
54
+
55
+ return suggestions[:3] # Return top 3 suggestions
56
+
57
+
58
+ def detect_ambiguity(query: str, results_count: int, confidence: float) -> Tuple[bool, Optional[str]]:
59
+ """
60
+ Detect if query is ambiguous.
61
+
62
+ Args:
63
+ query: User query.
64
+ results_count: Number of results found.
65
+ confidence: Confidence score.
66
+
67
+ Returns:
68
+ Tuple of (is_ambiguous, ambiguity_reason).
69
+ """
70
+ query_lower = query.lower()
71
+ query_words = query.split()
72
+
73
+ # Very short queries are often ambiguous
74
+ if len(query_words) <= 2:
75
+ return (True, "Câu hỏi quá ngắn, cần thêm thông tin")
76
+
77
+ # Low confidence and many results suggests ambiguity
78
+ if results_count > 10 and confidence < 0.5:
79
+ return (True, "Kết quả quá nhiều, cần cụ thể hơn")
80
+
81
+ # Very generic queries
82
+ generic_queries = ["thông tin", "tìm kiếm", "hỏi", "giúp"]
83
+ if any(gq in query_lower for gq in generic_queries) and len(query_words) <= 3:
84
+ return (True, "Câu hỏi chung chung, cần cụ thể hơn")
85
+
86
+ return (False, None)
87
+
88
+
89
+ def generate_explanation(result: Any, query: str, score: Optional[float] = None) -> str:
90
+ """
91
+ Generate explanation for why a result is relevant.
92
+
93
+ Args:
94
+ result: Result object.
95
96
+ query: Original query.
97
+ score: Relevance score.
98
+
99
+ Returns:
100
+ Explanation string.
101
+ """
102
+ result_type = type(result).__name__.lower()
103
+ explanation_parts = []
104
+
105
+ if "fine" in result_type:
106
+ name = getattr(result, "name", "")
107
+ code = getattr(result, "code", "")
108
+ explanation_parts.append(f"Kết quả này phù hợp vì:")
109
+ if code:
110
+ explanation_parts.append(f"- Mã vi phạm: {code}")
111
+ if name:
112
+ explanation_parts.append(f"- Tên vi phạm: {name}")
113
+ if score:
114
+ explanation_parts.append(f"- Độ phù hợp: {score:.0%}")
115
+
116
+ elif "procedure" in result_type:
117
+ title = getattr(result, "title", "")
118
+ explanation_parts.append(f"Kết quả này phù hợp vì:")
119
+ if title:
120
+ explanation_parts.append(f"- Tên thủ tục: {title}")
121
+ if score:
122
+ explanation_parts.append(f"- Độ phù hợp: {score:.0%}")
123
+
124
+ elif "office" in result_type:
125
+ unit_name = getattr(result, "unit_name", "")
126
+ explanation_parts.append(f"Kết quả này phù hợp vì:")
127
+ if unit_name:
128
+ explanation_parts.append(f"- Tên đơn vị: {unit_name}")
129
+ if score:
130
+ explanation_parts.append(f"- Độ phù hợp: {score:.0%}")
131
+
132
+ elif "advisory" in result_type:
133
+ title = getattr(result, "title", "")
134
+ explanation_parts.append(f"Kết quả này phù hợp vì:")
135
+ if title:
136
+ explanation_parts.append(f"- Tiêu đề: {title}")
137
+ if score:
138
+ explanation_parts.append(f"- Độ phù hợp: {score:.0%}")
139
+
140
+ return "\n".join(explanation_parts) if explanation_parts else "Kết quả này phù hợp với câu hỏi của bạn."
141
+
142
+
143
+ def compare_results(results: List[Any], result_type: str) -> str:
144
+ """
145
+ Compare multiple results and highlight differences.
146
+
147
+ Args:
148
+ results: List of result objects.
149
+ result_type: Type of results.
150
+
151
+ Returns:
152
+ Comparison summary string.
153
+ """
154
+ if len(results) < 2:
155
+ return ""
156
+
157
+ comparison_parts = ["So sánh các kết quả:"]
158
+
159
+ if result_type == "fine":
160
+ # Compare fine amounts
161
+ fine_amounts = []
162
+ for result in results[:3]:
163
+ if hasattr(result, "min_fine") and hasattr(result, "max_fine"):
164
+ if result.min_fine and result.max_fine:
165
+ fine_amounts.append(f"{result.name}: {result.min_fine:,.0f} - {result.max_fine:,.0f} VNĐ")
166
+
167
+ if fine_amounts:
168
+ comparison_parts.extend(fine_amounts)
169
+
170
+ elif result_type == "procedure":
171
+ # Compare procedures by domain/level
172
+ for result in results[:3]:
173
+ title = getattr(result, "title", "")
174
+ domain = getattr(result, "domain", "")
175
+ level = getattr(result, "level", "")
176
+ if title:
177
+ comp = f"- {title}"
178
+ if domain:
179
+ comp += f" ({domain})"
180
+ if level:
181
+ comp += f" - Cấp {level}"
182
+ comparison_parts.append(comp)
183
+
184
+ return "\n".join(comparison_parts)
185
+
backend/hue_portal/chatbot/analytics.py ADDED
@@ -0,0 +1,194 @@
1
+ """
2
+ Analytics and monitoring for Dual-Path RAG routing.
3
+ """
4
+ from datetime import datetime, timedelta
5
+ from typing import Dict, Any, List
6
+ from django.db.models import Count, Avg, Sum, Q, F
7
+ from django.utils import timezone
8
+
9
+ from hue_portal.core.models import QueryRoutingLog, GoldenQuery
10
+
11
+
12
+ def get_routing_stats(days: int = 7) -> Dict[str, Any]:
13
+ """
14
+ Get routing statistics for the last N days.
15
+
16
+ Args:
17
+ days: Number of days to analyze (default: 7).
18
+
19
+ Returns:
20
+ Dictionary with routing statistics.
21
+ """
22
+ cutoff_date = timezone.now() - timedelta(days=days)
23
+
24
+ logs = QueryRoutingLog.objects.filter(created_at__gte=cutoff_date)
25
+
26
+ total_count = logs.count()
27
+ if total_count == 0:
28
+ return {
29
+ 'total_queries': 0,
30
+ 'fast_path_count': 0,
31
+ 'slow_path_count': 0,
32
+ 'fast_path_percentage': 0.0,
33
+ 'slow_path_percentage': 0.0,
34
+ 'fast_path_avg_time_ms': 0.0,
35
+ 'slow_path_avg_time_ms': 0.0,
36
+ 'router_methods': {},
37
+ 'intent_breakdown': {},
38
+ 'cache_hit_rate': 0.0,
39
+ 'top_golden_queries': [],
40
+ }
41
+
42
+ # Path statistics
43
+ fast_path_count = logs.filter(route='fast_path').count()
44
+ slow_path_count = logs.filter(route='slow_path').count()
45
+
46
+ # Average response times
47
+ fast_path_avg = logs.filter(route='fast_path').aggregate(
48
+ avg_time=Avg('response_time_ms')
49
+ )['avg_time'] or 0.0
50
+
51
+ slow_path_avg = logs.filter(route='slow_path').aggregate(
52
+ avg_time=Avg('response_time_ms')
53
+ )['avg_time'] or 0.0
54
+
55
+ # Router methods breakdown
56
+ router_methods = dict(
57
+ logs.values('router_method')
58
+ .annotate(count=Count('id'))
59
+ .values_list('router_method', 'count')
60
+ )
61
+
62
+ # Intent breakdown
63
+ intent_breakdown = dict(
64
+ logs.values('intent')
65
+ .annotate(count=Count('id'))
66
+ .values_list('intent', 'count')
67
+ )
68
+
69
+ # Cache hit rate (Fast Path usage)
70
+ cache_hit_rate = (fast_path_count / total_count * 100) if total_count > 0 else 0.0
71
+
72
+ # Top golden queries by usage
73
+ top_golden_queries = list(
74
+ GoldenQuery.objects.filter(is_active=True)
75
+ .order_by('-usage_count')[:10]
76
+ .values('id', 'query', 'intent', 'usage_count', 'accuracy_score')
77
+ )
78
+
79
+ return {
80
+ 'total_queries': total_count,
81
+ 'fast_path_count': fast_path_count,
82
+ 'slow_path_count': slow_path_count,
83
+ 'fast_path_percentage': (fast_path_count / total_count * 100) if total_count > 0 else 0.0,
84
+ 'slow_path_percentage': (slow_path_count / total_count * 100) if total_count > 0 else 0.0,
85
+ 'fast_path_avg_time_ms': round(fast_path_avg, 2),
86
+ 'slow_path_avg_time_ms': round(slow_path_avg, 2),
87
+ 'router_methods': router_methods,
88
+ 'intent_breakdown': intent_breakdown,
89
+ 'cache_hit_rate': round(cache_hit_rate, 2),
90
+ 'top_golden_queries': top_golden_queries,
91
+ 'period_days': days,
92
+ }
93
+
94
+
95
+ def get_golden_dataset_stats() -> Dict[str, Any]:
96
+ """
97
+ Get statistics about the golden dataset.
98
+
99
+ Returns:
100
+ Dictionary with golden dataset statistics.
101
+ """
102
+ total_queries = GoldenQuery.objects.count()
103
+ active_queries = GoldenQuery.objects.filter(is_active=True).count()
104
+
105
+ # Intent breakdown
106
+ intent_breakdown = dict(
107
+ GoldenQuery.objects.filter(is_active=True)
108
+ .values('intent')
109
+ .annotate(count=Count('id'))
110
+ .values_list('intent', 'count')
111
+ )
112
+
113
+ # Total usage
114
+ total_usage = GoldenQuery.objects.aggregate(
115
+ total_usage=Sum('usage_count')
116
+ )['total_usage'] or 0
117
+
118
+ # Average accuracy
119
+ avg_accuracy = GoldenQuery.objects.filter(is_active=True).aggregate(
120
+ avg_accuracy=Avg('accuracy_score')
121
+ )['avg_accuracy'] or 1.0
122
+
123
+ # Queries with embeddings
124
+ with_embeddings = GoldenQuery.objects.filter(
125
+ is_active=True,
126
+ query_embedding__isnull=False
127
+ ).count()
128
+
129
+ return {
130
+ 'total_queries': total_queries,
131
+ 'active_queries': active_queries,
132
+ 'intent_breakdown': intent_breakdown,
133
+ 'total_usage': total_usage,
134
+ 'avg_accuracy': round(avg_accuracy, 3),
135
+ 'with_embeddings': with_embeddings,
136
+ 'embedding_coverage': (with_embeddings / active_queries * 100) if active_queries > 0 else 0.0,
137
+ }
138
+
139
+
140
+ def get_performance_metrics(days: int = 7) -> Dict[str, Any]:
141
+ """
142
+ Get performance metrics for both paths.
143
+
144
+ Args:
145
+ days: Number of days to analyze.
146
+
147
+ Returns:
148
+ Dictionary with performance metrics.
149
+ """
150
+ cutoff_date = timezone.now() - timedelta(days=days)
151
+ logs = QueryRoutingLog.objects.filter(created_at__gte=cutoff_date)
152
+
153
+ # P95, P99 response times
154
+ fast_path_times = list(
155
+ logs.filter(route='fast_path')
156
+ .values_list('response_time_ms', flat=True)
157
+ .order_by('response_time_ms')
158
+ )
159
+ slow_path_times = list(
160
+ logs.filter(route='slow_path')
161
+ .values_list('response_time_ms', flat=True)
162
+ .order_by('response_time_ms')
163
+ )
164
+
165
+ def percentile(data: List[float], p: float) -> float:
166
+ """Calculate percentile of sorted data."""
167
+ if not data:
168
+ return 0.0
169
+ if len(data) == 1:
170
+ return data[0]
171
+ k = (len(data) - 1) * p
172
+ f = int(k)
173
+ c = k - f
174
+ if f + 1 < len(data):
175
+ return float(data[f] + c * (data[f + 1] - data[f]))
176
+ return float(data[-1])
177
+
178
+ return {
179
+ 'fast_path': {
180
+ 'p50': percentile(fast_path_times, 0.5),
181
+ 'p95': percentile(fast_path_times, 0.95),
182
+ 'p99': percentile(fast_path_times, 0.99),
183
+ 'min': min(fast_path_times) if fast_path_times else 0.0,
184
+ 'max': max(fast_path_times) if fast_path_times else 0.0,
185
+ },
186
+ 'slow_path': {
187
+ 'p50': percentile(slow_path_times, 0.5),
188
+ 'p95': percentile(slow_path_times, 0.95),
189
+ 'p99': percentile(slow_path_times, 0.99),
190
+ 'min': min(slow_path_times) if slow_path_times else 0.0,
191
+ 'max': max(slow_path_times) if slow_path_times else 0.0,
192
+ },
193
+ }
194
+
backend/hue_portal/chatbot/apps.py ADDED
@@ -0,0 +1,7 @@
1
+ from django.apps import AppConfig
2
+
3
+
4
+ class ChatbotConfig(AppConfig):
5
+ default_auto_field = 'django.db.models.BigAutoField'
6
+ name = 'hue_portal.chatbot'
7
+
backend/hue_portal/chatbot/cache_monitor.py ADDED
@@ -0,0 +1,195 @@
1
+ """
2
+ Monitor Hugging Face model cache directory to track download progress.
3
+ This is a simpler approach that monitors the cache directory size.
4
+ """
5
+ import os
6
+ import time
7
+ import threading
8
+ from pathlib import Path
9
+ from typing import Dict, Optional
10
+ from dataclasses import dataclass, field
11
+
12
+
13
+ @dataclass
14
+ class CacheProgress:
15
+ """Track cache directory size progress."""
16
+ model_path: str
17
+ cache_path: Optional[str] = None
18
+ total_size_bytes: int = 0
19
+ current_size_bytes: int = 0
20
+ files_count: int = 0
21
+ files_completed: int = 0
22
+ last_updated: float = 0.0
23
+ is_monitoring: bool = False
24
+
25
+ @property
26
+ def percentage(self) -> float:
27
+ """Calculate progress percentage."""
28
+ if self.total_size_bytes == 0:
29
+ # Estimate based on typical model sizes
30
+ if "32B" in self.model_path or "32b" in self.model_path:
31
+ estimated_size = 70 * 1024 * 1024 * 1024 # ~70GB for 32B
32
+ elif "7B" in self.model_path or "7b" in self.model_path:
33
+ estimated_size = 15 * 1024 * 1024 * 1024 # ~15GB for 7B
34
+ else:
35
+ estimated_size = 5 * 1024 * 1024 * 1024 # ~5GB default
36
+ return min(100.0, (self.current_size_bytes / estimated_size) * 100.0)
37
+ return min(100.0, (self.current_size_bytes / self.total_size_bytes) * 100.0)
38
+
39
+ @property
40
+ def size_gb(self) -> float:
41
+ """Get current size in GB."""
42
+ return self.current_size_bytes / (1024 ** 3)
43
+
44
+ @property
45
+ def total_size_gb(self) -> float:
46
+ """Get total size in GB."""
47
+ if self.total_size_bytes == 0:
48
+ # Estimate
49
+ if "32B" in self.model_path or "32b" in self.model_path:
50
+ return 70.0
51
+ elif "7B" in self.model_path or "7b" in self.model_path:
52
+ return 15.0
53
+ else:
54
+ return 5.0
55
+ return self.total_size_bytes / (1024 ** 3)
56
+
57
+ def to_dict(self) -> Dict:
58
+ """Convert to dictionary."""
59
+ return {
60
+ "model_path": self.model_path,
61
+ "cache_path": self.cache_path,
62
+ "current_size_bytes": self.current_size_bytes,
63
+ "current_size_gb": round(self.size_gb, 2),
64
+ "total_size_bytes": self.total_size_bytes,
65
+ "total_size_gb": round(self.total_size_gb, 2),
66
+ "percentage": round(self.percentage, 2),
67
+ "files_count": self.files_count,
68
+ "files_completed": self.files_completed,
69
+ "is_monitoring": self.is_monitoring,
70
+ "last_updated": self.last_updated
71
+ }
72
+
73
+
74
+ class CacheMonitor:
75
+ """Monitor cache directory for download progress."""
76
+
77
+ def __init__(self):
78
+ self._progress: Dict[str, CacheProgress] = {}
79
+ self._lock = threading.Lock()
80
+ self._monitoring_threads: Dict[str, threading.Thread] = {}
81
+
82
+ def get_or_create(self, model_path: str) -> CacheProgress:
83
+ """Get or create progress tracker."""
84
+ with self._lock:
85
+ if model_path not in self._progress:
86
+ self._progress[model_path] = CacheProgress(model_path=model_path)
87
+ return self._progress[model_path]
88
+
89
+ def get(self, model_path: str) -> Optional[CacheProgress]:
90
+ """Get progress tracker."""
91
+ with self._lock:
92
+ return self._progress.get(model_path)
93
+
94
+ def _get_cache_path(self, model_path: str) -> Optional[Path]:
95
+ """Get cache path for model."""
96
+ try:
97
+ cache_dir = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
98
+ repo_id = model_path.replace("/", "--")
99
+ cache_path = Path(cache_dir) / "hub" / f"models--{repo_id}"
100
+ return cache_path if cache_path.exists() else None
101
+ except Exception:
102
+ return None
103
+
104
+ def _monitor_cache(self, model_path: str, interval: float = 2.0):
105
+ """Monitor cache directory size."""
106
+ progress = self.get_or_create(model_path)
107
+ progress.is_monitoring = True
108
+
109
+ cache_path = self._get_cache_path(model_path)
110
+ if cache_path:
111
+ progress.cache_path = str(cache_path)
112
+
113
+ while progress.is_monitoring:
114
+ try:
115
+ if cache_path and cache_path.exists():
116
+ # Calculate current size
117
+ total_size = 0
118
+ file_count = 0
119
+ for file_path in cache_path.rglob("*"):
120
+ if file_path.is_file():
121
+ file_count += 1
122
+ total_size += file_path.stat().st_size
123
+
124
+ progress.current_size_bytes = total_size
125
+ progress.files_count = file_count
126
+ progress.last_updated = time.time()
127
+
128
+ # Check for key files to determine completion
129
+ key_files = ["config.json", "tokenizer.json", "model.safetensors", "pytorch_model.bin"]
130
+ found_files = []
131
+ for key_file in key_files:
132
+ if list(cache_path.rglob(key_file)):
133
+ found_files.append(key_file)
134
+ progress.files_completed = len(found_files)
135
+
136
+ # Estimate total size if not set
137
+ if progress.total_size_bytes == 0 and progress.files_completed == len(key_files):
138
+ # All key files found, use current size as total
139
+ progress.total_size_bytes = total_size
140
+ else:
141
+ # Cache doesn't exist yet, check if it was created
142
+ cache_path = self._get_cache_path(model_path)
143
+ if cache_path:
144
+ progress.cache_path = str(cache_path)
145
+
146
+ time.sleep(interval)
147
+ except Exception as e:
148
+ logger.error(f"Error monitoring cache: {e}")
149
+ time.sleep(interval)
150
+
151
+ def start_monitoring(self, model_path: str, interval: float = 2.0):
152
+ """Start monitoring cache directory."""
153
+ with self._lock:
154
+ if model_path not in self._monitoring_threads:
155
+ thread = threading.Thread(
156
+ target=self._monitor_cache,
157
+ args=(model_path, interval),
158
+ daemon=True
159
+ )
160
+ thread.start()
161
+ self._monitoring_threads[model_path] = thread
162
+
163
+ def stop_monitoring(self, model_path: str):
164
+ """Stop monitoring cache directory."""
165
+ with self._lock:
166
+ progress = self._progress.get(model_path)
167
+ if progress:
168
+ progress.is_monitoring = False
169
+ if model_path in self._monitoring_threads:
170
+ del self._monitoring_threads[model_path]
171
+
172
+ def get_progress(self, model_path: str) -> Optional[Dict]:
173
+ """Get progress as dictionary."""
174
+ progress = self.get(model_path)
175
+ if progress:
176
+ return progress.to_dict()
177
+ return None
178
+
179
+
180
+ # Global monitor instance
181
+ _global_monitor = CacheMonitor()
182
+
183
+
184
+ def get_cache_monitor() -> CacheMonitor:
185
+ """Get global cache monitor instance."""
186
+ return _global_monitor
187
+
188
+
189
+ # Module-level logger: defined after the class, but bound at import time, before any monitoring thread runs
190
+ import logging
191
+ logger = logging.getLogger(__name__)
192
+
193
+
194
+
195
+
backend/hue_portal/chatbot/chatbot.py ADDED
@@ -0,0 +1,1092 @@
1
+ """
2
+ Chatbot wrapper that integrates core chatbot with router, LLM, and context management.
3
+ """
4
+ import os
5
+ import copy
6
+ import logging
7
+ import json
8
+ import time
9
+ import unicodedata
10
+ import re
11
+ from typing import Dict, Any, Optional
12
+ from hue_portal.core.chatbot import Chatbot as CoreChatbot, get_chatbot as get_core_chatbot
13
+ from hue_portal.chatbot.router import decide_route, IntentRoute, RouteDecision, DOCUMENT_CODE_PATTERNS
14
+ from hue_portal.chatbot.context_manager import ConversationContext
15
+ from hue_portal.chatbot.llm_integration import LLMGenerator
16
+ from hue_portal.core.models import LegalSection, LegalDocument
17
+ from hue_portal.chatbot.exact_match_cache import ExactMatchCache
18
+ from hue_portal.chatbot.slow_path_handler import SlowPathHandler
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ EXACT_MATCH_CACHE = ExactMatchCache(
23
+ max_size=int(os.environ.get("EXACT_MATCH_CACHE_MAX", "256")),
24
+ ttl_seconds=int(os.environ.get("EXACT_MATCH_CACHE_TTL_SECONDS", "43200")),
25
+ )
26
+
27
+ DEBUG_LOG_PATH = "/Users/davidtran/Downloads/TryHarDemNayProject/.cursor/debug.log"
28
+ DEBUG_SESSION_ID = "debug-session"
29
+ DEBUG_RUN_ID = "pre-fix"
30
+
31
+ #region agent log
32
+ def _agent_debug_log(hypothesis_id: str, location: str, message: str, data: Dict[str, Any]):
33
+ try:
34
+ payload = {
35
+ "sessionId": DEBUG_SESSION_ID,
36
+ "runId": DEBUG_RUN_ID,
37
+ "hypothesisId": hypothesis_id,
38
+ "location": location,
39
+ "message": message,
40
+ "data": data,
41
+ "timestamp": int(time.time() * 1000),
42
+ }
43
+ with open(DEBUG_LOG_PATH, "a", encoding="utf-8") as log_file:
44
+ log_file.write(json.dumps(payload, ensure_ascii=False) + "\n")
45
+ except Exception:
46
+ pass
47
+ #endregion
48
+
49
+
50
+ class Chatbot(CoreChatbot):
51
+ """
52
+ Enhanced chatbot with session support, routing, and RAG capabilities.
53
+ """
54
+
55
+ def __init__(self):
56
+ super().__init__()
57
+ self.llm_generator = None
58
+ # In-memory cache: keep the most recent legal answer per session so follow-up questions are handled quickly
59
+ self._last_legal_answer_by_session: Dict[str, str] = {}
60
+ self._initialize_llm()
61
+
62
+ def _initialize_llm(self):
63
+ """Initialize LLM generator if needed."""
64
+ try:
65
+ self.llm_generator = LLMGenerator()
66
+ except Exception as e:
67
+ print(f"⚠️ LLM generator not available: {e}")
68
+ self.llm_generator = None
69
+
70
+ def generate_response(self, query: str, session_id: Optional[str] = None) -> Dict[str, Any]:
71
+ """
72
+ Generate chatbot response with session support and routing.
73
+
74
+ Args:
75
+ query: User query string
76
+ session_id: Optional session ID for conversation context
77
+
78
+ Returns:
79
+ Response dictionary with message, intent, results, etc.
80
+ """
81
+ query = query.strip()
82
+
83
+ # Save user message to context
84
+ if session_id:
85
+ try:
86
+ ConversationContext.add_message(
87
+ session_id=session_id,
88
+ role="user",
89
+ content=query
90
+ )
91
+ except Exception as e:
92
+ print(f"⚠️ Failed to save user message: {e}")
93
+
94
+ session_metadata: Dict[str, Any] = {}
95
+ selected_doc_code: Optional[str] = None
96
+ if session_id:
97
+ try:
98
+ session_metadata = ConversationContext.get_session_metadata(session_id)
99
+ selected_doc_code = session_metadata.get("selected_document_code")
100
+ except Exception:
101
+ session_metadata = {}
102
+
103
+ # Classify intent
104
+ intent, confidence = self.classify_intent(query)
105
+
106
+ # Router decision (using raw intent)
107
+ route_decision = decide_route(query, intent, confidence)
108
+
109
+ # Use forced intent if router suggests it
110
+ if route_decision.forced_intent:
111
+ intent = route_decision.forced_intent
112
+
113
+ # If the session already has a selected_document_code (the user picked a document in the wizard),
114
+ # always force the intent to search_legal and route to SEARCH,
115
+ # so we do not get stuck in the small-talk/off-topic branch because of the original question's wording.
116
+ if selected_doc_code:
117
+ intent = "search_legal"
118
+ route_decision.route = IntentRoute.SEARCH
119
+ route_decision.forced_intent = "search_legal"
120
+
121
+ # Map every content-lookup intent to search_legal
122
+ domain_search_intents = {
123
+ "search_fine",
124
+ "search_procedure",
125
+ "search_office",
126
+ "search_advisory",
127
+ "general_query",
128
+ }
129
+ if intent in domain_search_intents:
130
+ intent = "search_legal"
131
+ route_decision.route = IntentRoute.SEARCH
132
+ route_decision.forced_intent = "search_legal"
133
+
134
+ # Instant exact-match cache lookup
135
+ # ⚠️ Disable the cache for the search_legal intent so it always goes through the wizard / Slow Path,
136
+ # avoiding stale cached answers that have no options.
137
+ cached_response = None
138
+ if intent != "search_legal":
139
+ cached_response = EXACT_MATCH_CACHE.get(query, intent)
140
+ if cached_response:
141
+ cached_response["_cache"] = "exact_match"
142
+ cached_response["_source"] = cached_response.get("_source", "cache")
143
+ cached_response.setdefault("routing", route_decision.route.value)
144
+ logger.info(
145
+ "[CACHE] Hit for intent=%s route=%s source=%s",
146
+ intent,
147
+ route_decision.route.value,
148
+ cached_response["_source"],
149
+ )
150
+ if session_id:
151
+ cached_response["session_id"] = session_id
152
+ if session_id:
153
+ try:
154
+ ConversationContext.add_message(
155
+ session_id=session_id,
156
+ role="bot",
157
+ content=cached_response.get("message", ""),
158
+ intent=intent,
159
+ )
160
+ except Exception as e:
161
+ print(f"⚠️ Failed to save cached bot message: {e}")
162
+ return cached_response
163
+
164
+ # Wizard / option-first handling right at the chatbot layer:
165
+ # Multi-stage wizard flow:
166
+ # Stage 1: Choose document (if no document selected)
167
+ # Stage 2: Choose topic/section (if document selected but no topic)
168
+ # Stage 3: Choose detail (if topic selected, ask for more details)
169
+ # Final: Answer (when user says "Không" or after detail selection)
170
+ disable_wizard_flow = os.environ.get("DISABLE_WIZARD_FLOW", "false").lower() == "true"
171
+ print(f"[WIZARD] DISABLE_WIZARD_FLOW={os.environ.get('DISABLE_WIZARD_FLOW', 'false')} -> disable_wizard_flow={disable_wizard_flow}")
172
+
173
+ has_doc_code_in_query = self._query_has_document_code(query)
174
+ wizard_stage = session_metadata.get("wizard_stage") if session_metadata else None
175
+ selected_topic = session_metadata.get("selected_topic") if session_metadata else None
176
+ wizard_depth = session_metadata.get("wizard_depth", 0) if session_metadata else 0
177
+
178
+ print(f"[WIZARD] Chatbot layer check - intent={intent}, wizard_stage={wizard_stage}, selected_doc_code={selected_doc_code}, selected_topic={selected_topic}, has_doc_code_in_query={has_doc_code_in_query}, query='{query[:50]}'")
179
+
180
+ # CRITICAL: If wizard flow is disabled, reset all wizard state immediately
181
+ if disable_wizard_flow:
182
+ print("[WIZARD] 🚫 Wizard flow DISABLED - resetting all wizard state and skipping wizard stages")
183
+ selected_doc_code = None
184
+ selected_topic = None
185
+ wizard_stage = None
186
+ wizard_depth = 0
187
+ # Update session metadata to clear wizard state
188
+ if session_id:
189
+ try:
190
+ ConversationContext.update_session_metadata(
191
+ session_id,
192
+ {
193
+ "selected_document_code": None,
194
+ "selected_topic": None,
195
+ "wizard_stage": None,
196
+ "wizard_depth": 0,
197
+ }
198
+ )
199
+ print("[WIZARD] ✅ Wizard state cleared from session metadata")
200
+ except Exception as e:
201
+ print(f"⚠️ Failed to clear wizard state: {e}")
202
+ # Also update session_metadata dict for current function scope
203
+ if session_metadata:
204
+ session_metadata["selected_document_code"] = None
205
+ session_metadata["selected_topic"] = None
206
+ session_metadata["wizard_stage"] = None
207
+ session_metadata["wizard_depth"] = 0
208
+
209
+ # Reset wizard state if new query doesn't have document code and wizard_stage is "answer"
210
+ # This handles the case where user asks a new question after completing a previous wizard flow
211
+ # CRITICAL: Check conditions and reset BEFORE Stage 1 check
212
+ should_reset = (
213
+ not disable_wizard_flow
214
+ and intent == "search_legal"
215
+ and not has_doc_code_in_query
216
+ and wizard_stage == "answer"
217
+ )
218
+ print(f"[WIZARD] Reset check - intent={intent}, has_doc_code={has_doc_code_in_query}, wizard_stage={wizard_stage}, should_reset={should_reset}") # v2.0-fix
219
+
220
+ if should_reset:
221
+ print("[WIZARD] 🔄 New query detected, resetting wizard state for fresh start")
222
+ selected_doc_code = None
223
+ selected_topic = None
224
+ wizard_stage = None
225
+ # Update session metadata FIRST before continuing
226
+ if session_id:
227
+ try:
228
+ ConversationContext.update_session_metadata(
229
+ session_id,
230
+ {
231
+ "selected_document_code": None,
232
+ "selected_topic": None,
233
+ "wizard_stage": None,
234
+ "wizard_depth": 0,
235
+ }
236
+ )
237
+ print("[WIZARD] ✅ Wizard state reset in session metadata")
238
+ except Exception as e:
239
+ print(f"⚠️ Failed to reset wizard state: {e}")
240
+ # Also update session_metadata dict for current function scope
241
+ if session_metadata:
242
+ session_metadata["selected_document_code"] = None
243
+ session_metadata["selected_topic"] = None
244
+ session_metadata["wizard_stage"] = None
245
+ session_metadata["wizard_depth"] = 0
246
+
247
+ # Stage 1: Choose document (if no document selected and no code in query)
248
+ # Use Query Rewrite Strategy from slow_path_handler instead of old LLM suggestions
249
+ if (
250
+ intent == "search_legal"
251
+ and not selected_doc_code
252
+ and not has_doc_code_in_query
253
+ and not disable_wizard_flow
254
+ ):
255
+ print("[WIZARD] ✅ Stage 1: Using Query Rewrite Strategy from slow_path_handler")
256
+ # Delegate to slow_path_handler which has Query Rewrite Strategy
257
+ slow_handler = SlowPathHandler()
258
+ response = slow_handler.handle(
259
+ query=query,
260
+ intent=intent,
261
+ session_id=session_id,
262
+ selected_document_code=None, # No document selected yet
263
+ )
264
+
265
+ # Ensure response has wizard metadata
266
+ if response:
267
+ response.setdefault("wizard_stage", "choose_document")
268
+ response.setdefault("routing", "legal_wizard")
269
+ response.setdefault("type", "options")
270
+
271
+ # Update session metadata
272
+ if session_id:
273
+ try:
274
+ ConversationContext.update_session_metadata(
275
+ session_id,
276
+ {
277
+ "wizard_stage": "choose_document",
278
+ "wizard_depth": 1,
279
+ }
280
+ )
281
+ except Exception as e:
282
+ logger.warning("[WIZARD] Failed to update session metadata: %s", e)
283
+
284
+ # Save bot message to context
285
+ if session_id:
286
+ try:
287
+ bot_message = response.get("message") or response.get("clarification", {}).get("message", "")
288
+ ConversationContext.add_message(
289
+ session_id=session_id,
290
+ role="bot",
291
+ content=bot_message,
292
+ intent=intent,
293
+ )
294
+ except Exception as e:
295
+ print(f"⚠️ Failed to save wizard bot message: {e}")
296
+
297
+ return response if response else {
298
+ "message": "Xin lỗi, có lỗi xảy ra khi tìm kiếm văn bản.",
299
+ "intent": intent,
300
+ "results": [],
301
+ "count": 0,
302
+ }
303
+
304
+ # Stage 2: Choose topic/section (if document selected but no topic yet)
305
+ # Skip if wizard_stage is already "answer" (user wants final answer)
306
+ if (
307
+ intent == "search_legal"
308
+ and selected_doc_code
309
+ and not selected_topic
310
+ and not has_doc_code_in_query
311
+ and wizard_stage != "answer"
312
+ and not disable_wizard_flow
313
+ ):
314
+ print("[WIZARD] ✅ Stage 2 triggered: Choose topic/section")
315
+
316
+ # Get document title
317
+ document_title = selected_doc_code
318
+ try:
319
+ doc = LegalDocument.objects.filter(code=selected_doc_code).first()
320
+ if doc:
321
+ document_title = getattr(doc, "title", "") or selected_doc_code
322
+ except Exception:
323
+ pass
324
+
325
+ # Extract keywords from query for parallel search
326
+ search_keywords_from_query = []
327
+ if self.llm_generator:
328
+ try:
329
+ conversation_context = None
330
+ if session_id:
331
+ try:
332
+ recent_messages = ConversationContext.get_recent_messages(session_id, limit=5)
333
+ conversation_context = [
334
+ {"role": msg.role, "content": msg.content}
335
+ for msg in recent_messages
336
+ ]
337
+ except Exception:
338
+ pass
339
+
340
+ search_keywords_from_query = self.llm_generator.extract_search_keywords(
341
+ query=query,
342
+ selected_options=None, # No options selected yet
343
+ conversation_context=conversation_context,
344
+ )
345
+ print(f"[WIZARD] Extracted keywords: {search_keywords_from_query[:5]}")
346
+ except Exception as exc:
347
+ logger.warning("[WIZARD] Keyword extraction failed: %s", exc)
348
+
349
+ # Fallback to simple keyword extraction
350
+ if not search_keywords_from_query:
351
+ search_keywords_from_query = self.chatbot.extract_keywords(query)
352
+
353
+ # Trigger parallel search for document (if not already done)
354
+ slow_handler = SlowPathHandler()
355
+ prefetched_results = slow_handler._get_prefetched_results(session_id, "document_results")
356
+
357
+ if not prefetched_results:
358
+ # Trigger parallel search now
359
+ slow_handler._parallel_search_prepare(
360
+ document_code=selected_doc_code,
361
+ keywords=search_keywords_from_query,
362
+ session_id=session_id,
363
+ )
364
+ logger.info("[WIZARD] Triggered parallel search for document")
365
+
366
+ # Get prefetched search results from parallel search (if available)
367
+ prefetched_results = slow_handler._get_prefetched_results(session_id, "document_results")
368
+ search_results = []
369
+
370
+ if prefetched_results:
371
+ search_results = prefetched_results.get("results", [])
372
+ logger.info("[WIZARD] Using prefetched results: %d sections", len(search_results))
373
+ else:
374
+ # Fallback: search synchronously if prefetch not ready
375
+ search_result = slow_handler._search_by_intent(
376
+ intent="search_legal",
377
+ query=query,
378
+ limit=20,
379
+ preferred_document_code=selected_doc_code.upper(),
380
+ )
381
+ search_results = search_result.get("results", [])
382
+ logger.info("[WIZARD] Fallback search: %d sections", len(search_results))
383
+
384
+ # Extract keywords for topic options
385
+ conversation_context = None
386
+ if session_id:
387
+ try:
388
+ recent_messages = ConversationContext.get_recent_messages(session_id, limit=5)
389
+ conversation_context = [
390
+ {"role": msg.role, "content": msg.content}
391
+ for msg in recent_messages
392
+ ]
393
+ except Exception:
394
+ pass
395
+
396
+ # Use LLM to generate topic options
397
+ topic_options = []
398
+ intro_message = f"Bạn muốn tìm điều khoản/chủ đề nào cụ thể trong {document_title}?"
399
+ search_keywords = []
400
+
401
+ if self.llm_generator:
402
+ try:
403
+ llm_payload = self.llm_generator.suggest_topic_options(
404
+ query=query,
405
+ document_code=selected_doc_code,
406
+ document_title=document_title,
407
+ search_results=search_results[:10], # Top 10 for options
408
+ conversation_context=conversation_context,
409
+ max_options=3,
410
+ )
411
+ if llm_payload:
412
+ intro_message = llm_payload.get("message") or intro_message
413
+ topic_options = llm_payload.get("options", [])
414
+ search_keywords = llm_payload.get("search_keywords", [])
415
+ print(f"[WIZARD] ✅ LLM generated {len(topic_options)} topic options")
416
+ except Exception as exc:
417
+ logger.warning("[WIZARD] LLM topic suggestion failed: %s", exc)
418
+
419
+ # Fallback: build options from search results
420
+ if not topic_options and search_results:
421
+ for result in search_results[:3]:
422
+ data = result.get("data", {})
423
+ section_title = data.get("section_title") or data.get("title") or ""
424
+ article = data.get("article") or data.get("article_number") or ""
425
+ if section_title or article:
426
+ topic_options.append({
427
+ "title": section_title or article,
428
+ "article": article,
429
+ "reason": data.get("excerpt", "")[:100] or "",
430
+ "keywords": [],
431
+ })
432
+
433
+ # If still no options, create generic ones
434
+ if not topic_options:
435
+ topic_options = [
436
+ {
437
+ "title": "Các điều khoản liên quan",
438
+ "article": "",
439
+ "reason": "Tìm kiếm các điều khoản liên quan đến câu hỏi của bạn",
440
+ "keywords": [],
441
+ }
442
+ ]
443
+
444
+ # Trigger parallel search for selected keywords
445
+ if search_keywords:
446
+ slow_handler._parallel_search_topic(
447
+ document_code=selected_doc_code,
448
+ topic_keywords=search_keywords,
449
+ session_id=session_id,
450
+ )
451
+
452
+ response = {
453
+ "message": intro_message,
454
+ "intent": intent,
455
+ "confidence": confidence,
456
+ "results": [],
457
+ "count": 0,
458
+ "routing": "legal_wizard",
459
+ "type": "options",
460
+ "wizard_stage": "choose_topic",
461
+ "clarification": {
462
+ "message": intro_message,
463
+ "options": topic_options,
464
+ },
465
+ "options": topic_options,
466
+ }
467
+ if session_id:
468
+ response["session_id"] = session_id
469
+ try:
470
+ ConversationContext.add_message(
471
+ session_id=session_id,
472
+ role="bot",
473
+ content=intro_message,
474
+ intent=intent,
475
+ )
476
+ ConversationContext.update_session_metadata(
477
+ session_id,
478
+ {
479
+ "wizard_stage": "choose_topic",
480
+ },
481
+ )
482
+ except Exception as e:
483
+ print(f"⚠️ Failed to save Stage 2 bot message: {e}")
484
+ return response
485
+
486
+ # Stage 3: Choose detail (if topic selected, ask if user wants more details)
487
+ # Skip if wizard_stage is already "answer" (user wants final answer)
488
+ if intent == "search_legal" and selected_doc_code and selected_topic and wizard_stage != "answer":
489
+ # Check if user is asking for more details or saying "Không"
490
+ query_lower = query.lower()
491
+ wants_more = any(kw in query_lower for kw in ["có", "cần", "muốn", "thêm", "chi tiết", "nữa"])
492
+ says_no = any(kw in query_lower for kw in ["không", "khong", "thôi", "đủ", "xong"])
493
+
494
+ if says_no or wizard_depth >= 2:
495
+ # User doesn't want more details or already asked twice - proceed to final answer
496
+ print("[WIZARD] ✅ User wants final answer, proceeding to slow_path")
497
+ # Clear wizard stage to allow normal answer flow
498
+ if session_id:
499
+ try:
500
+ ConversationContext.update_session_metadata(
501
+ session_id,
502
+ {
503
+ "wizard_stage": "answer",
504
+ },
505
+ )
506
+ except Exception:
507
+ pass
508
+ elif wants_more or wizard_depth == 0:
509
+ # User wants more details - generate detail options
510
+ print("[WIZARD] ✅ Stage 3 triggered: Choose detail")
511
+
512
+ # Get conversation context
513
+ conversation_context = None
514
+ if session_id:
515
+ try:
516
+ recent_messages = ConversationContext.get_recent_messages(session_id, limit=5)
517
+ conversation_context = [
518
+ {"role": msg.role, "content": msg.content}
519
+ for msg in recent_messages
520
+ ]
521
+ except Exception:
522
+ pass
523
+
524
+ # Use LLM to generate detail options
525
+ detail_options = []
526
+ intro_message = "Bạn muốn chi tiết gì cho chủ đề này nữa không?"
527
+ search_keywords = []
528
+
529
+ if self.llm_generator:
530
+ try:
531
+ llm_payload = self.llm_generator.suggest_detail_options(
532
+ query=query,
533
+ selected_document_code=selected_doc_code,
534
+ selected_topic=selected_topic,
535
+ conversation_context=conversation_context,
536
+ max_options=3,
537
+ )
538
+ if llm_payload:
539
+ intro_message = llm_payload.get("message") or intro_message
540
+ detail_options = llm_payload.get("options", [])
541
+ search_keywords = llm_payload.get("search_keywords", [])
542
+ print(f"[WIZARD] ✅ LLM generated {len(detail_options)} detail options")
543
+ except Exception as exc:
544
+ logger.warning("[WIZARD] LLM detail suggestion failed: %s", exc)
545
+
546
+ # Fallback options
547
+ if not detail_options:
548
+ detail_options = [
549
+ {
550
+ "title": "Thẩm quyền xử lý",
551
+ "reason": "Tìm hiểu về thẩm quyền xử lý kỷ luật",
552
+ "keywords": ["thẩm quyền", "xử lý"],
553
+ },
554
+ {
555
+ "title": "Trình tự, thủ tục",
556
+ "reason": "Tìm hiểu về trình tự, thủ tục xử lý",
557
+ "keywords": ["trình tự", "thủ tục"],
558
+ },
559
+ {
560
+ "title": "Hình thức kỷ luật",
561
+ "reason": "Tìm hiểu về các hình thức kỷ luật",
562
+ "keywords": ["hình thức", "kỷ luật"],
563
+ },
564
+ ]
565
+
566
+ # Trigger parallel search for detail keywords
567
+ if search_keywords and session_id:
568
+ slow_handler = SlowPathHandler()
569
+ slow_handler._parallel_search_topic(
570
+ document_code=selected_doc_code,
571
+ topic_keywords=search_keywords,
572
+ session_id=session_id,
573
+ )
574
+
575
+ response = {
576
+ "message": intro_message,
577
+ "intent": intent,
578
+ "confidence": confidence,
579
+ "results": [],
580
+ "count": 0,
581
+ "routing": "legal_wizard",
582
+ "type": "options",
583
+ "wizard_stage": "choose_detail",
584
+ "clarification": {
585
+ "message": intro_message,
586
+ "options": detail_options,
587
+ },
588
+ "options": detail_options,
589
+ }
590
+ if session_id:
591
+ response["session_id"] = session_id
592
+ try:
593
+ ConversationContext.add_message(
594
+ session_id=session_id,
595
+ role="bot",
596
+ content=intro_message,
597
+ intent=intent,
598
+ )
599
+ ConversationContext.update_session_metadata(
600
+ session_id,
601
+ {
602
+ "wizard_stage": "choose_detail",
603
+ "wizard_depth": wizard_depth + 1,
604
+ },
605
+ )
606
+ except Exception as e:
607
+ print(f"⚠️ Failed to save Stage 3 bot message: {e}")
608
+ return response
609
+
610
+ # Always send legal intent through Slow Path RAG
611
+ if intent == "search_legal":
612
+ response = self._run_slow_path_legal(
613
+ query,
614
+ intent,
615
+ session_id,
616
+ route_decision,
617
+ session_metadata=session_metadata,
618
+ )
619
+ elif route_decision.route == IntentRoute.GREETING:
620
+ response = {
621
+ "message": "Xin chào! Tôi có thể giúp bạn tra cứu các thông tin liên quan về các văn bản quy định pháp luật về xử lí kỷ luật cán bộ đảng viên",
622
+ "intent": "greeting",
623
+ "confidence": 0.9,
624
+ "results": [],
625
+ "count": 0,
626
+ "routing": "greeting"
627
+ }
628
+
629
+ elif route_decision.route == IntentRoute.SMALL_TALK:
630
+ # Handle follow-up questions within the conversation context
631
+ follow_up_keywords = [
632
+ "có điều khoản",
633
+ "liên quan",
634
+ "khác",
635
+ "nữa",
636
+ "thêm",
637
+ "tóm tắt",
638
+ "tải file",
639
+ "tải",
640
+ "download",
641
+ ]
642
+ query_lower = query.lower()
643
+ is_follow_up = any(kw in query_lower for kw in follow_up_keywords)
644
+ #region agent log
645
+ _agent_debug_log(
646
+ hypothesis_id="H2",
647
+ location="chatbot.py:119",
648
+ message="follow_up_detection",
649
+ data={
650
+ "query": query,
651
+ "is_follow_up": is_follow_up,
652
+ "session_id_present": bool(session_id),
653
+ },
654
+ )
655
+ #endregion
656
+
657
+ response = None
658
+
659
+ # For follow-up questions, prefer the most recent legal context stored in the session
660
+ if is_follow_up and session_id:
661
+ previous_answer = self._last_legal_answer_by_session.get(session_id, "")
662
+
663
+ # If not in the in-memory cache, fall back to the ConversationContext DB
664
+ if not previous_answer:
665
+ try:
666
+ recent_messages = ConversationContext.get_recent_messages(session_id, limit=5)
667
+ for msg in reversed(recent_messages):
668
+ if msg.role == "bot" and msg.intent == "search_legal":
669
+ previous_answer = msg.content or ""
670
+ break
671
+ except Exception as e:
672
+ logger.warning("[FOLLOW_UP] Failed to load context from DB: %s", e)
673
+
674
+ if previous_answer:
675
+ if "tóm tắt" in query_lower:
676
+ summary_message = None
677
+ if getattr(self, "llm_generator", None):
678
+ try:
679
+ prompt = (
680
+ "Bạn là chuyên gia pháp luật. Hãy tóm tắt ngắn gọn, rõ ràng nội dung chính của đoạn sau "
681
+ "(giữ nguyên tinh thần và các mức, tỷ lệ, hình thức kỷ luật nếu có):\n\n"
682
+ f"{previous_answer}"
683
+ )
684
+ summary_message = self.llm_generator.generate_answer(
685
+ prompt,
686
+ context=None,
687
+ documents=None,
688
+ )
689
+ except Exception as e:
690
+ logger.warning("[FOLLOW_UP] LLM summary failed: %s", e)
691
+
692
+ if summary_message:
693
+ message = summary_message
694
+ else:
695
+ content_preview = (
696
+ previous_answer[:400] + "..." if len(previous_answer) > 400 else previous_answer
697
+ )
698
+ message = "Tóm tắt nội dung chính của điều khoản trước đó:\n\n" f"{content_preview}"
699
+ elif "tải" in query_lower:
700
+ message = (
701
+ "Bạn có thể tải file gốc của văn bản tại mục Quản lý văn bản trên hệ thống "
702
+ "hoặc liên hệ cán bộ phụ trách để được cung cấp bản đầy đủ."
703
+ )
704
+ else:
705
+ message = (
706
+ "Trong câu trả lời trước, tôi đã trích dẫn điều khoản chính liên quan. "
707
+ "Nếu bạn cần điều khoản khác (ví dụ về thẩm quyền, trình tự, hồ sơ), "
708
+ "hãy nêu rõ nội dung muốn tìm để tôi trợ giúp nhanh nhất."
709
+ )
710
+
711
+ response = {
712
+ "message": message,
713
+ "intent": "search_legal",
714
+ "confidence": 0.85,
715
+ "results": [],
716
+ "count": 0,
717
+ "routing": "follow_up",
718
+ }
719
+
720
+ # If this is not a follow-up or no context was found, return a friendly message
721
+ if response is None:
722
+ #region agent log
723
+ _agent_debug_log(
724
+ hypothesis_id="H1",
725
+ location="chatbot.py:193",
726
+ message="follow_up_fallback",
727
+ data={
728
+ "is_follow_up": is_follow_up,
729
+ "session_id_present": bool(session_id),
730
+ },
731
+ )
732
+ #endregion
733
+ # Detect off-topic questions (cooking, recipes, etc.)
734
+ off_topic_keywords = ["nấu", "nau", "chả trứng", "cha trung", "món ăn", "mon an", "công thức", "cong thuc",
735
+ "cách làm", "cach lam", "đổ chả", "do cha", "trứng", "trung"]
736
+ is_off_topic = any(kw in query_lower for kw in off_topic_keywords)
737
+
738
+ if is_off_topic:
739
+ # Out of scope → decline politely and suggest the wizard with the main legal documents
740
+ intro_message = (
741
+ "Xin lỗi, tôi là chatbot chuyên về tra cứu các văn bản quy định pháp luật "
742
+ "về xử lí kỷ luật cán bộ đảng viên của Phòng Thanh Tra - Công An Thành Phố Huế.\n\n"
743
+ "Tôi không thể trả lời các câu hỏi về nấu ăn, công thức nấu ăn hay các chủ đề khác ngoài phạm vi pháp luật.\n\n"
744
+ "Tuy nhiên, tôi có thể giúp bạn tra cứu một số văn bản pháp luật quan trọng. "
745
+ "Bạn hãy chọn văn bản muốn xem trước:"
746
+ )
747
+ clarification_options = [
748
+ {
749
+ "code": "264-QD-TW",
750
+ "title": "Quyết định 264-QĐ/TW về kỷ luật đảng viên",
751
+ "reason": "Quy định chung về xử lý kỷ luật đối với đảng viên vi phạm.",
752
+ },
753
+ {
754
+ "code": "QD-69-TW",
755
+ "title": "Quy định 69-QĐ/TW về kỷ luật tổ chức đảng, đảng viên",
756
+ "reason": "Quy định chi tiết về các hành vi vi phạm và hình thức kỷ luật.",
757
+ },
758
+ {
759
+ "code": "TT-02-CAND",
760
+ "title": "Thông tư 02/2021/TT-BCA về điều lệnh CAND",
761
+ "reason": "Quy định về điều lệnh, lễ tiết, tác phong trong CAND.",
762
+ },
763
+ {
764
+ "code": "__other__",
765
+ "title": "Khác",
766
+ "reason": "Tôi muốn hỏi văn bản hoặc chủ đề pháp luật khác.",
767
+ },
768
+ ]
769
+ response = {
770
+ "message": intro_message,
771
+ "intent": intent,
772
+ "confidence": confidence,
773
+ "results": [],
774
+ "count": 0,
775
+ "routing": "small_talk_offtopic_wizard",
776
+ "type": "options",
777
+ "wizard_stage": "choose_document",
778
+ "clarification": {
779
+ "message": intro_message,
780
+ "options": clarification_options,
781
+ },
782
+ "options": clarification_options,
783
+ }
784
+ else:
785
+ message = (
786
+ "Tôi có thể giúp bạn tra cứu các văn bản quy định pháp luật về xử lí kỷ luật cán bộ đảng viên. "
787
+ "Bạn muốn tìm gì?"
788
+ )
789
+ response = {
790
+ "message": message,
791
+ "intent": intent,
792
+ "confidence": confidence,
793
+ "results": [],
794
+ "count": 0,
795
+ "routing": "small_talk",
796
+ }
797
+
798
+ else: # IntentRoute.SEARCH
799
+ # Use core chatbot search for other intents
800
+ search_result = self.search_by_intent(intent, query, limit=5)
801
+
802
+ # Generate response message
803
+ if search_result["count"] > 0:
804
+ template = self._get_response_template(intent)
805
+ message = template.format(
806
+ count=search_result["count"],
807
+ query=query
808
+ )
809
+ else:
810
+ message = f"Xin lỗi, tôi không tìm thấy thông tin liên quan đến '{query}'. Vui lòng thử lại với từ khóa khác."
811
+
812
+ response = {
813
+ "message": message,
814
+ "intent": intent,
815
+ "confidence": confidence,
816
+ "results": search_result["results"],
817
+ "count": search_result["count"],
818
+ "routing": "search"
819
+ }
820
+
821
+ if session_id and intent == "search_legal":
822
+ try:
823
+ self._last_legal_answer_by_session[session_id] = response.get("message", "") or ""
824
+ except Exception:
825
+ pass
826
+
827
+ # Mark the payload type for the frontend: answer or options (wizard)
828
+ if response.get("clarification") or response.get("type") == "options":
829
+ response.setdefault("type", "options")
830
+ else:
831
+ response.setdefault("type", "answer")
832
+
833
+ # Add session_id
834
+ if session_id:
835
+ response["session_id"] = session_id
836
+
837
+ # Save bot response to context
838
+ if session_id:
839
+ try:
840
+ bot_message = response.get("message") or response.get("clarification", {}).get("message", "")
841
+ ConversationContext.add_message(
842
+ session_id=session_id,
843
+ role="bot",
844
+ content=bot_message,
845
+ intent=intent
846
+ )
847
+ except Exception as e:
848
+ print(f"⚠️ Failed to save bot message: {e}")
849
+
850
+ self._cache_response(query, intent, response)
851
+
852
+ return response
853
+
854
+ def _run_slow_path_legal(
855
+ self,
856
+ query: str,
857
+ intent: str,
858
+ session_id: Optional[str],
859
+ route_decision: RouteDecision,
860
+ session_metadata: Optional[Dict[str, Any]] = None,
861
+ ) -> Dict[str, Any]:
862
+ """Execute Slow Path legal handler (with fast-path + structured output)."""
863
+ slow_handler = SlowPathHandler()
864
+ selected_doc_code = None
865
+ if session_metadata:
866
+ selected_doc_code = session_metadata.get("selected_document_code")
867
+ response = slow_handler.handle(
868
+ query,
869
+ intent,
870
+ session_id,
871
+ selected_document_code=selected_doc_code,
872
+ )
873
+ response.setdefault("routing", "slow_path")
874
+ response.setdefault(
875
+ "_routing",
876
+ {
877
+ "path": "slow_path",
878
+ "method": getattr(route_decision, "rationale", "router"),
879
+ "confidence": route_decision.confidence,
880
+ },
881
+ )
882
+
883
+ # Simple wizard metadata update: if we are still asking the user to choose a document,
884
+ # mark stage = choose_document; once an answer has been returned, mark stage = answer.
885
+ if session_id:
886
+ try:
887
+ if response.get("clarification") or response.get("type") == "options":
888
+ ConversationContext.update_session_metadata(
889
+ session_id,
890
+ {
891
+ "wizard_stage": "choose_document",
892
+ },
893
+ )
894
+ else:
895
+ ConversationContext.update_session_metadata(
896
+ session_id,
897
+ {
898
+ "wizard_stage": "answer",
899
+ "last_answer_type": response.get("intent"),
900
+ },
901
+ )
902
+ except Exception:
903
+ # Do not let metadata errors break the main answer flow
904
+ pass
905
+
906
+ logger.info(
907
+ "[LEGAL] Slow path response - source=%s count=%s routing=%s",
908
+ response.get("_source"),
909
+ response.get("count"),
910
+ response.get("_routing"),
911
+ )
912
+ return response
913
+
914
+ def _cache_response(self, query: str, intent: str, response: Dict[str, Any]) -> None:
915
+ """Store response in exact-match cache if eligible."""
916
+ if not self._should_cache_response(intent, response):
917
+ logger.debug(
918
+ "[CACHE] Skip storing response (intent=%s, results=%s)",
919
+ intent,
920
+ response.get("count"),
921
+ )
922
+ return
923
+ payload = copy.deepcopy(response)
924
+ payload.pop("session_id", None)
925
+ payload.pop("_cache", None)
926
+ EXACT_MATCH_CACHE.set(query, intent, payload)
927
+ logger.info(
928
+ "[CACHE] Stored response for intent=%s (results=%s, source=%s)",
929
+ intent,
930
+ response.get("count"),
931
+ response.get("_source"),
932
+ )
933
+
934
+ def _should_cache_response(self, intent: str, response: Dict[str, Any]) -> bool:
935
+ """Determine if response should be cached for exact matches."""
936
+ if response.get("clarification"):
937
+ return False
938
+ cacheable_intents = {
939
+ "search_legal",
940
+ "search_fine",
941
+ "search_procedure",
942
+ "search_office",
943
+ "search_advisory",
944
+ }
945
+ if intent not in cacheable_intents:
946
+ return False
947
+ if response.get("count", 0) <= 0:
948
+ return False
949
+ if not response.get("results"):
950
+ return False
951
+ return True
952
+
953
+ def _query_has_document_code(self, query: str) -> bool:
954
+ """
955
+ Check if the raw query string explicitly contains a known document code pattern
956
+ (e.g. '264/QĐ-TW', 'QD-69-TW', 'TT-02-CAND').
957
+ """
958
+ if not query:
959
+ return False
960
+ # Remove accents so the regex patterns stay simpler
961
+ normalized = unicodedata.normalize("NFD", query)
962
+ normalized = "".join(ch for ch in normalized if unicodedata.category(ch) != "Mn")
963
+ normalized = normalized.upper()
964
+ for pattern in DOCUMENT_CODE_PATTERNS:
965
+ try:
966
+ if re.search(pattern, normalized):
967
+ return True
968
+ except re.error:
969
+ continue
970
+ return False
971
+
972
+ def _handle_legal_query(self, query: str, session_id: Optional[str] = None) -> Dict[str, Any]:
973
+ """
974
+ Handle legal document queries with RAG pipeline.
975
+
976
+ Args:
977
+ query: User query
978
+ session_id: Optional session ID
979
+
980
+ Returns:
981
+ Response dictionary
982
+ """
983
+ # Search legal sections
984
+ qs = LegalSection.objects.select_related("document").all()
985
+ text_fields = ["section_title", "section_code", "content"]
986
+ legal_sections = self._search_legal_sections(qs, query, text_fields, top_k=5)
987
+
988
+ if not legal_sections:
989
+ return {
990
+ "message": f"Xin lỗi, tôi không tìm thấy văn bản pháp luật liên quan đến '{query}'.",
991
+ "intent": "search_legal",
992
+ "confidence": 0.5,
993
+ "results": [],
994
+ "count": 0,
995
+ "routing": "search"
996
+ }
997
+
998
+ # Try LLM generation if available
999
+ if self.llm_generator and self.llm_generator.provider != "none":
1000
+ try:
1001
+ answer = self.llm_generator.generate_structured_legal_answer(
1002
+ query=query,
1003
+ documents=legal_sections,
1004
+ max_attempts=2
1005
+ )
1006
+ message = answer.summary
1007
+ except Exception as e:
1008
+ print(f"⚠️ LLM generation failed: {e}")
1009
+ message = self._format_legal_results(legal_sections, query)
1010
+ else:
1011
+ # Template-based response
1012
+ message = self._format_legal_results(legal_sections, query)
1013
+
1014
+ # Format results
1015
+ results = []
1016
+ for section in legal_sections:
1017
+ doc = section.document
1018
+ results.append({
1019
+ "type": "legal",
1020
+ "data": {
1021
+ "id": section.id,
1022
+ "section_code": section.section_code,
1023
+ "section_title": section.section_title or "",
1024
+ "content": section.content[:500] + "..." if len(section.content) > 500 else section.content,
1025
+ "excerpt": section.excerpt or "",
1026
+ "document_code": doc.code if doc else "",
1027
+ "document_title": doc.title if doc else "",
1028
+ "page_start": section.page_start,
1029
+ "page_end": section.page_end,
1030
+ "download_url": f"/api/legal-documents/{doc.id}/download/" if doc and doc.id else None,
1031
+ "source_url": doc.source_url if doc else ""
1032
+ }
1033
+ })
1034
+
1035
+ return {
1036
+ "message": message,
1037
+ "intent": "search_legal",
1038
+ "confidence": 0.9,
1039
+ "results": results,
1040
+ "count": len(results),
1041
+ "routing": "search"
1042
+ }
1043
+
1044
+ def _search_legal_sections(self, qs, query: str, text_fields: list, top_k: int = 5):
1045
+ """Search legal sections using ML search."""
1046
+ from hue_portal.core.search_ml import search_with_ml
1047
+ return search_with_ml(qs, query, text_fields, top_k=top_k, min_score=0.1)
1048
+
1049
+ def _format_legal_results(self, sections, query: str) -> str:
1050
+ """Format legal sections into response message."""
1051
+ if not sections:
1052
+ return f"Xin lỗi, tôi không tìm thấy văn bản pháp luật liên quan đến '{query}'."
1053
+
1054
+ doc = sections[0].document
1055
+ doc_info = f"{doc.code}: {doc.title}" if doc else "Văn bản pháp luật"
1056
+
1057
+ message = f"Tôi tìm thấy {len(sections)} điều khoản liên quan đến '{query}' trong {doc_info}:\n\n"
1058
+
1059
+ for i, section in enumerate(sections[:3], 1):
1060
+ section_text = f"{section.section_code}: {section.section_title or ''}\n"
1061
+ section_text += section.content[:200] + "..." if len(section.content) > 200 else section.content
1062
+ message += f"{i}. {section_text}\n\n"
1063
+
1064
+ if len(sections) > 3:
1065
+ message += f"... và {len(sections) - 3} điều khoản khác."
1066
+
1067
+ return message
1068
+
1069
+ def _get_response_template(self, intent: str) -> str:
1070
+ """Get response template for intent."""
1071
+ templates = {
1072
+ "search_fine": "Tôi tìm thấy {count} mức phạt liên quan đến '{query}':",
1073
+ "search_procedure": "Tôi tìm thấy {count} thủ tục liên quan đến '{query}':",
1074
+ "search_office": "Tôi tìm thấy {count} đơn vị liên quan đến '{query}':",
1075
+ "search_advisory": "Tôi tìm thấy {count} cảnh báo liên quan đến '{query}':",
1076
+ }
1077
+ return templates.get(intent, "Tôi tìm thấy {count} kết quả liên quan đến '{query}':")
1078
+
1079
+
1080
+ # Global chatbot instance
1081
+ _chatbot_instance = None
1082
+
1083
+
1084
+ def get_chatbot() -> Chatbot:
1085
+ """Get or create enhanced chatbot instance."""
1086
+ global _chatbot_instance
1087
+ if _chatbot_instance is None:
1088
+ _chatbot_instance = Chatbot()
1089
+ return _chatbot_instance
1090
+
1091
+
1092
+
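Reviewer note: a minimal usage sketch for the `get_chatbot()` entry point added above. It assumes a configured Django environment; the settings module path and the session id are illustrative assumptions, not values defined in this commit.

```python
# Hypothetical smoke test for the chatbot entry point; settings path and session_id
# are assumptions for illustration only.
import os

import django

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hue_portal.hue_portal.settings")  # assumed settings module path
django.setup()

from hue_portal.chatbot.chatbot import get_chatbot

bot = get_chatbot()
response = bot.generate_response(
    "Đảng viên vi phạm bị xử lý kỷ luật như thế nào?",
    session_id="123e4567-e89b-12d3-a456-426614174000",  # example UUID; enables wizard state in the session
)
print(response.get("type"))      # "options" while the wizard is asking, "answer" otherwise
print(response.get("routing"), response.get("intent"))
for option in response.get("options", []):  # only present on wizard-stage responses
    print("-", option.get("title"))
```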
backend/hue_portal/chatbot/context_manager.py ADDED
@@ -0,0 +1,220 @@
1
+ """
2
+ Context manager for conversation sessions and messages.
3
+ """
4
+ from typing import List, Dict, Any, Optional
5
+ from uuid import UUID
6
+ from hue_portal.core.models import ConversationSession, ConversationMessage
7
+
8
+
9
+ class ConversationContext:
10
+ """Manages conversation sessions and context."""
11
+
12
+ @staticmethod
13
+ def get_session(session_id: Optional[str] = None, user_id: Optional[str] = None) -> ConversationSession:
14
+ """
15
+ Get or create a conversation session.
16
+
17
+ Args:
18
+ session_id: Optional session ID (UUID string). If None, creates new session.
19
+ user_id: Optional user ID for tracking.
20
+
21
+ Returns:
22
+ ConversationSession instance.
23
+ """
24
+ if session_id:
25
+ try:
26
+ # Try to get existing session
27
+ session = ConversationSession.objects.get(session_id=session_id)
28
+ # Update updated_at timestamp
29
+ session.save(update_fields=["updated_at"])
30
+ return session
31
+ except ConversationSession.DoesNotExist:
32
+ # Create new session with provided session_id
33
+ return ConversationSession.objects.create(
34
+ session_id=session_id,
35
+ user_id=user_id
36
+ )
37
+ else:
38
+ # Create new session
39
+ return ConversationSession.objects.create(user_id=user_id)
40
+
41
+ @staticmethod
42
+ def add_message(
43
+ session_id: str,
44
+ role: str,
45
+ content: str,
46
+ intent: Optional[str] = None,
47
+ entities: Optional[Dict[str, Any]] = None,
48
+ metadata: Optional[Dict[str, Any]] = None
49
+ ) -> ConversationMessage:
50
+ """
51
+ Add a message to a conversation session.
52
+
53
+ Args:
54
+ session_id: Session ID (UUID string).
55
+ role: Message role ('user' or 'bot').
56
+ content: Message content.
57
+ intent: Detected intent (optional).
58
+ entities: Extracted entities (optional).
59
+ metadata: Additional metadata (optional).
60
+
61
+ Returns:
62
+ ConversationMessage instance.
63
+ """
64
+ session = ConversationContext.get_session(session_id=session_id)
65
+
66
+ return ConversationMessage.objects.create(
67
+ session=session,
68
+ role=role,
69
+ content=content,
70
+ intent=intent or "",
71
+ entities=entities or {},
72
+ metadata=metadata or {}
73
+ )
74
+
75
+ @staticmethod
76
+ def get_recent_messages(session_id: str, limit: int = 10) -> List[ConversationMessage]:
77
+ """
78
+ Get recent messages from a session.
79
+
80
+ Args:
81
+ session_id: Session ID (UUID string).
82
+ limit: Maximum number of messages to return.
83
+
84
+ Returns:
85
+ List of ConversationMessage instances, ordered by timestamp (oldest first).
86
+ """
87
+ try:
88
+ session = ConversationSession.objects.get(session_id=session_id)
89
+ return list(session.messages.all()[:limit])
90
+ except ConversationSession.DoesNotExist:
91
+ return []
92
+
93
+ @staticmethod
94
+ def get_context_summary(session_id: str, max_messages: int = 5) -> Dict[str, Any]:
95
+ """
96
+ Create a summary of conversation context.
97
+
98
+ Args:
99
+ session_id: Session ID (UUID string).
100
+ max_messages: Maximum number of messages to include in summary.
101
+
102
+ Returns:
103
+ Dictionary with context summary including:
104
+ - recent_messages: List of recent messages
105
+ - entities: Aggregated entities from conversation
106
+ - intents: List of intents mentioned
107
+ - message_count: Total number of messages
108
+ """
109
+ messages = ConversationContext.get_recent_messages(session_id, limit=max_messages)
110
+
111
+ # Aggregate entities
112
+ all_entities = {}
113
+ intents = []
114
+
115
+ for msg in messages:
116
+ if msg.entities:
117
+ for key, value in msg.entities.items():
118
+ if key not in all_entities:
119
+ all_entities[key] = []
120
+ if value not in all_entities[key]:
121
+ all_entities[key].append(value)
122
+
123
+ if msg.intent:
124
+ if msg.intent not in intents:
125
+ intents.append(msg.intent)
126
+
127
+ return {
128
+ "recent_messages": [
129
+ {
130
+ "role": msg.role,
131
+ "content": msg.content,
132
+ "intent": msg.intent,
133
+ "timestamp": msg.timestamp.isoformat()
134
+ }
135
+ for msg in messages
136
+ ],
137
+ "entities": all_entities,
138
+ "intents": intents,
139
+ "message_count": len(messages)
140
+ }
141
+
142
+ @staticmethod
143
+ def extract_entities(query: str) -> Dict[str, Any]:
144
+ """
145
+ Extract entities from a query (basic implementation).
146
+ This is a placeholder - will be enhanced by entity_extraction.py
147
+
148
+ Args:
149
+ query: User query string.
150
+
151
+ Returns:
152
+ Dictionary with extracted entities.
153
+ """
154
+ entities = {}
155
+ query_lower = query.lower()
156
+
157
+ # Basic fine code extraction (V001, V002, etc.)
158
+ import re
159
+ fine_codes = re.findall(r'\bV\d{3}\b', query, re.IGNORECASE)
160
+ if fine_codes:
161
+ entities["fine_codes"] = fine_codes
162
+
163
+ # Basic procedure keywords
164
+ procedure_keywords = ["thủ tục", "hồ sơ", "giấy tờ"]
165
+ if any(kw in query_lower for kw in procedure_keywords):
166
+ entities["has_procedure"] = True
167
+
168
+ # Basic fine keywords
169
+ fine_keywords = ["phạt", "mức phạt", "vi phạm"]
170
+ if any(kw in query_lower for kw in fine_keywords):
171
+ entities["has_fine"] = True
172
+
173
+ return entities
174
+
175
+ @staticmethod
176
+ def get_session_metadata(session_id: str) -> Dict[str, Any]:
177
+ """
178
+ Return metadata stored with the conversation session.
179
+ """
180
+ if not session_id:
181
+ return {}
182
+ try:
183
+ session = ConversationSession.objects.get(session_id=session_id)
184
+ return session.metadata or {}
185
+ except ConversationSession.DoesNotExist:
186
+ return {}
187
+
188
+ @staticmethod
189
+ def update_session_metadata(session_id: str, data: Dict[str, Any]) -> Dict[str, Any]:
190
+ """
191
+ Merge provided data into session metadata and persist.
192
+ """
193
+ if not session_id:
194
+ return {}
195
+ session = ConversationContext.get_session(session_id=session_id)
196
+ metadata = session.metadata or {}
197
+ metadata.update(data)
198
+ session.metadata = metadata
199
+ session.save(update_fields=["metadata", "updated_at"])
200
+ return metadata
201
+
202
+ @staticmethod
203
+ def clear_session_metadata_keys(session_id: str, keys: List[str]) -> Dict[str, Any]:
204
+ """
205
+ Remove specific keys from session metadata.
206
+ """
207
+ if not session_id:
208
+ return {}
209
+ session = ConversationContext.get_session(session_id=session_id)
210
+ metadata = session.metadata or {}
211
+ changed = False
212
+ for key in keys:
213
+ if key in metadata:
214
+ metadata.pop(key)
215
+ changed = True
216
+ if changed:
217
+ session.metadata = metadata
218
+ session.save(update_fields=["metadata", "updated_at"])
219
+ return metadata
220
+
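A short sketch of how the `ConversationContext` helpers above fit together. It assumes a configured Django environment with the ConversationSession/ConversationMessage migrations applied; the session id is generated just for the example.

```python
# Illustrative use of ConversationContext; requires Django to be set up first.
import uuid

from hue_portal.chatbot.context_manager import ConversationContext

session_id = str(uuid.uuid4())  # example session id

# Record one user turn and one bot turn.
ConversationContext.add_message(session_id, role="user", content="Quy định 69 nói gì về khiển trách?")
ConversationContext.add_message(session_id, role="bot", content="Các điều khoản liên quan...", intent="search_legal")

# Persist wizard state, then read it back on the next request.
ConversationContext.update_session_metadata(session_id, {"wizard_stage": "choose_topic", "selected_document_code": "QD-69-TW"})
metadata = ConversationContext.get_session_metadata(session_id)
recent = ConversationContext.get_recent_messages(session_id, limit=5)
print(metadata.get("wizard_stage"), len(recent))
```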
backend/hue_portal/chatbot/dialogue_manager.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ Dialogue management for multi-turn conversations.
3
+ """
4
+ from typing import Dict, Any, Optional, List, Tuple
5
+ from enum import Enum
6
+
7
+
8
+ class DialogueState(Enum):
9
+ """Dialogue states."""
10
+ INITIAL = "initial"
11
+ COLLECTING_INFO = "collecting_info"
12
+ CLARIFYING = "clarifying"
13
+ PROVIDING_ANSWER = "providing_answer"
14
+ FOLLOW_UP = "follow_up"
15
+ COMPLETED = "completed"
16
+
17
+
18
+ class DialogueManager:
19
+ """Manages dialogue state and multi-turn conversations."""
20
+
21
+ def __init__(self):
22
+ self.state = DialogueState.INITIAL
23
+ self.slots = {} # Slot filling for missing information
24
+ self.context_switch_detected = False
25
+
26
+ def update_state(
27
+ self,
28
+ query: str,
29
+ intent: str,
30
+ results_count: int,
31
+ confidence: float,
32
+ recent_messages: Optional[List[Dict[str, Any]]] = None
33
+ ) -> DialogueState:
34
+ """
35
+ Update dialogue state based on current query and context.
36
+
37
+ Args:
38
+ query: Current user query.
39
+ intent: Detected intent.
40
+ results_count: Number of results found.
41
+ confidence: Confidence score.
42
+ recent_messages: Recent conversation messages.
43
+
44
+ Returns:
45
+ Updated dialogue state.
46
+ """
47
+ # Detect context switching
48
+ if recent_messages and len(recent_messages) > 0:
49
+ last_intent = recent_messages[-1].get("intent")
50
+ if last_intent and last_intent != intent and intent != "greeting":
51
+ self.context_switch_detected = True
52
+ self.state = DialogueState.INITIAL
53
+ self.slots = {}
54
+ return self.state
55
+
56
+ # State transitions
57
+ if results_count == 0 and confidence < 0.5:
58
+ # No results and low confidence - need clarification
59
+ self.state = DialogueState.CLARIFYING
60
+ elif results_count > 0 and confidence >= 0.7:
61
+ # Good results - providing answer
62
+ self.state = DialogueState.PROVIDING_ANSWER
63
+ elif results_count > 0 and confidence < 0.7:
64
+ # Some results but uncertain - might need follow-up
65
+ self.state = DialogueState.FOLLOW_UP
66
+ else:
67
+ self.state = DialogueState.PROVIDING_ANSWER
68
+
69
+ return self.state
70
+
71
+ def needs_clarification(
72
+ self,
73
+ query: str,
74
+ intent: str,
75
+ results_count: int
76
+ ) -> Tuple[bool, Optional[str]]:
77
+ """
78
+ Check if clarification is needed.
79
+
80
+ Args:
81
+ query: User query.
82
+ intent: Detected intent.
83
+ results_count: Number of results.
84
+
85
+ Returns:
86
+ Tuple of (needs_clarification, clarification_message).
87
+ """
88
+ if results_count == 0:
89
+ # No results - ask for clarification
90
+ clarification_messages = {
91
+ "search_fine": "Bạn có thể cho biết cụ thể hơn về loại vi phạm không? Ví dụ: vượt đèn đỏ, không đội mũ bảo hiểm...",
92
+ "search_procedure": "Bạn muốn tìm thủ tục nào? Ví dụ: đăng ký cư trú, thủ tục ANTT...",
93
+ "search_office": "Bạn muốn tìm đơn vị nào? Ví dụ: công an phường, điểm tiếp dân...",
94
+ "search_advisory": "Bạn muốn tìm cảnh báo về chủ đề gì?",
95
+ }
96
+ message = clarification_messages.get(intent, "Bạn có thể cung cấp thêm thông tin không?")
97
+ return (True, message)
98
+
99
+ return (False, None)
100
+
101
+ def detect_missing_slots(
102
+ self,
103
+ intent: str,
104
+ query: str,
105
+ results_count: int
106
+ ) -> Dict[str, Any]:
107
+ """
108
+ Detect missing information slots.
109
+
110
+ Args:
111
+ intent: Detected intent.
112
+ query: User query.
113
+ results_count: Number of results.
114
+
115
+ Returns:
116
+ Dictionary of missing slots.
117
+ """
118
+ missing_slots = {}
119
+
120
+ if intent == "search_fine":
121
+ # Check for fine code or fine name
122
+ if "v001" not in query.lower() and "v002" not in query.lower():
123
+ if not any(kw in query.lower() for kw in ["vượt đèn đỏ", "mũ bảo hiểm", "nồng độ cồn"]):
124
+ missing_slots["fine_specification"] = True
125
+
126
+ elif intent == "search_procedure":
127
+ # Check for procedure name or domain
128
+ if not any(kw in query.lower() for kw in ["cư trú", "antt", "pccc", "đăng ký"]):
129
+ missing_slots["procedure_specification"] = True
130
+
131
+ elif intent == "search_office":
132
+ # Check for office name or location
133
+ if not any(kw in query.lower() for kw in ["phường", "huyện", "tỉnh", "điểm tiếp dân"]):
134
+ missing_slots["office_specification"] = True
135
+
136
+ return missing_slots
137
+
138
+ def handle_follow_up(
139
+ self,
140
+ query: str,
141
+ recent_messages: List[Dict[str, Any]]
142
+ ) -> Optional[str]:
143
+ """
144
+ Generate follow-up question if needed.
145
+
146
+ Args:
147
+ query: Current query.
148
+ recent_messages: Recent conversation messages.
149
+
150
+ Returns:
151
+ Follow-up question or None.
152
+ """
153
+ if not recent_messages:
154
+ return None
155
+
156
+ # Check if query is very short (likely a follow-up)
157
+ if len(query.split()) <= 3:
158
+ last_message = recent_messages[-1]
159
+ last_intent = last_message.get("intent")
160
+
161
+ if last_intent == "search_fine":
162
+ return "Bạn muốn biết thêm thông tin gì về mức phạt này? (ví dụ: điều luật, biện pháp khắc phục)"
163
+ elif last_intent == "search_procedure":
164
+ return "Bạn muốn biết thêm thông tin gì về thủ tục này? (ví dụ: hồ sơ, lệ phí, thời hạn)"
165
+
166
+ return None
167
+
168
+ def reset(self):
169
+ """Reset dialogue manager state."""
170
+ self.state = DialogueState.INITIAL
171
+ self.slots = {}
172
+ self.context_switch_detected = False
173
+
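A small, self-contained sketch of the state machine implemented above; the queries and scores are made up for illustration.

```python
# Walk DialogueManager through two typical transitions; no database access needed.
from hue_portal.chatbot.dialogue_manager import DialogueManager, DialogueState

dm = DialogueManager()

# Good results with high confidence -> PROVIDING_ANSWER
state = dm.update_state(query="mức phạt vượt đèn đỏ", intent="search_fine",
                        results_count=3, confidence=0.85)
assert state == DialogueState.PROVIDING_ANSWER

# No results and low confidence -> CLARIFYING, with a suggested clarification prompt
state = dm.update_state(query="phạt", intent="search_fine", results_count=0, confidence=0.3)
needs, prompt = dm.needs_clarification("phạt", "search_fine", results_count=0)
print(state, needs, prompt)

dm.reset()  # back to INITIAL between conversations
```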
backend/hue_portal/chatbot/document_topics.py ADDED
@@ -0,0 +1,74 @@
1
+ """
2
+ Domain-specific knowledge for clarification prompts.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ from typing import List, Dict
7
+
8
+
9
+ DOCUMENT_TOPICS: List[Dict[str, str]] = [
10
+ {
11
+ "code": "264-QD-TW",
12
+ "title": "Quy định 264/QĐ-TW (sửa đổi, bổ sung Quy định 69/QĐ-TW)",
13
+ "doc_type": "Quy định",
14
+ "summary": "Văn bản của Ban Chấp hành Trung ương về kỷ luật tổ chức đảng, thay thế quy định 69.",
15
+ "keywords": [
16
+ "264",
17
+ "quy định 264",
18
+ "qd 264",
19
+ "đảng",
20
+ "tổ chức đảng",
21
+ "kỷ luật đảng",
22
+ "ban chấp hành trung ương",
23
+ ],
24
+ },
25
+ {
26
+ "code": "QD-69-TW",
27
+ "title": "Quy định 69/QĐ-TW về kỷ luật tổ chức đảng, đảng viên vi phạm",
28
+ "doc_type": "Quy định",
29
+ "summary": "Quy định kỷ luật của Đảng ban hành năm 2022, nền tảng cho xử lý kỷ luật đảng viên.",
30
+ "keywords": [
31
+ "69",
32
+ "qd 69",
33
+ "quy định 69",
34
+ "kỷ luật đảng viên",
35
+ "kỷ luật cán bộ",
36
+ "vi phạm đảng",
37
+ ],
38
+ },
39
+ {
40
+ "code": "TT-02-CAND",
41
+ "title": "Thông tư 02/2021/TT-BCA về xử lý điều lệnh trong Công an nhân dân",
42
+ "doc_type": "Thông tư",
43
+ "summary": "Quy định xử lý vi phạm điều lệnh, hạ bậc thi đua đối với đơn vị thuộc CAND.",
44
+ "keywords": [
45
+ "thông tư 02",
46
+ "tt 02",
47
+ "điều lệnh",
48
+ "công an",
49
+ "cand",
50
+ "thi đua",
51
+ "đơn vị",
52
+ ],
53
+ },
54
+ {
55
+ "code": "TT-02-BIEN-SOAN",
56
+ "title": "Thông tư 02/2018/TT-BCA (Biên soạn) về soạn thảo văn bản",
57
+ "doc_type": "Thông tư",
58
+ "summary": "Hướng dẫn biên soạn, trình bày văn bản thuộc Bộ Công an.",
59
+ "keywords": [
60
+ "biên soạn",
61
+ "soạn thảo",
62
+ "thông tư 02 biên soạn",
63
+ ],
64
+ },
65
+ ]
66
+
67
+
68
+ def find_topic_by_code(code: str) -> Dict[str, str] | None:
69
+ code_upper = code.strip().upper()
70
+ for topic in DOCUMENT_TOPICS:
71
+ if topic["code"].upper() == code_upper:
72
+ return topic
73
+ return None
74
+
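For example, `find_topic_by_code` matches codes case-insensitively against the static catalogue above:

```python
# Quick check of the static topic catalogue defined above.
from hue_portal.chatbot.document_topics import DOCUMENT_TOPICS, find_topic_by_code

topic = find_topic_by_code("qd-69-tw")
print(topic["title"] if topic else "not found")  # Quy định 69/QĐ-TW ...
print(len(DOCUMENT_TOPICS))                      # 4 entries in this commit
```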
backend/hue_portal/chatbot/download_progress.py ADDED
@@ -0,0 +1,294 @@
1
+ """
2
+ Download progress tracker for Hugging Face models.
3
+ Tracks real-time download progress in bytes.
4
+ """
5
+ import threading
6
+ import time
7
+ from typing import Dict, Optional
8
+ from dataclasses import dataclass, field
9
+
10
+
11
+ @dataclass
12
+ class DownloadProgress:
13
+ """Track download progress for a single file."""
14
+ filename: str
15
+ total_bytes: int = 0
16
+ downloaded_bytes: int = 0
17
+ started_at: Optional[float] = None
18
+ completed_at: Optional[float] = None
19
+ speed_bytes_per_sec: float = 0.0
20
+
21
+ @property
22
+ def percentage(self) -> float:
23
+ """Calculate download percentage."""
24
+ if self.total_bytes == 0:
25
+ return 0.0
26
+ return min(100.0, (self.downloaded_bytes / self.total_bytes) * 100.0)
27
+
28
+ @property
29
+ def is_complete(self) -> bool:
30
+ """Check if download is complete."""
31
+ return self.total_bytes > 0 and self.downloaded_bytes >= self.total_bytes
32
+
33
+ @property
34
+ def elapsed_time(self) -> float:
35
+ """Get elapsed time in seconds."""
36
+ if self.started_at is None:
37
+ return 0.0
38
+ end_time = self.completed_at or time.time()
39
+ return end_time - self.started_at
40
+
41
+
42
+ @dataclass
43
+ class ModelDownloadProgress:
44
+ """Track overall download progress for a model."""
45
+ model_path: str
46
+ files: Dict[str, DownloadProgress] = field(default_factory=dict)
47
+ started_at: Optional[float] = None
48
+ completed_at: Optional[float] = None
49
+
50
+ def update_file(self, filename: str, downloaded: int, total: int):
51
+ """Update progress for a specific file."""
52
+ if filename not in self.files:
53
+ self.files[filename] = DownloadProgress(
54
+ filename=filename,
55
+ started_at=time.time()
56
+ )
57
+ if self.started_at is None:
58
+ self.started_at = time.time()
59
+
60
+ file_progress = self.files[filename]
61
+ file_progress.downloaded_bytes = downloaded
62
+ file_progress.total_bytes = total
63
+
64
+ # Calculate speed
65
+ if file_progress.started_at:
66
+ elapsed = time.time() - file_progress.started_at
67
+ if elapsed > 0:
68
+ file_progress.speed_bytes_per_sec = downloaded / elapsed
69
+
70
+ # Mark as complete
71
+ if total > 0 and downloaded >= total:
72
+ file_progress.completed_at = time.time()
73
+
74
+ def complete_file(self, filename: str):
75
+ """Mark a file as complete."""
76
+ if filename in self.files:
77
+ self.files[filename].completed_at = time.time()
78
+
79
+ @property
80
+ def total_bytes(self) -> int:
81
+ """Get total bytes across all files."""
82
+ return sum(f.total_bytes for f in self.files.values())
83
+
84
+ @property
85
+ def downloaded_bytes(self) -> int:
86
+ """Get downloaded bytes across all files."""
87
+ return sum(f.downloaded_bytes for f in self.files.values())
88
+
89
+ @property
90
+ def percentage(self) -> float:
91
+ """Calculate overall download percentage."""
92
+ total = self.total_bytes
93
+ if total == 0:
94
+ # If no total yet, count completed files
95
+ if len(self.files) == 0:
96
+ return 0.0
97
+ completed = sum(1 for f in self.files.values() if f.is_complete)
98
+ return (completed / len(self.files)) * 100.0
99
+ return min(100.0, (self.downloaded_bytes / total) * 100.0)
100
+
101
+ @property
102
+ def is_complete(self) -> bool:
103
+ """Check if all files are downloaded."""
104
+ if len(self.files) == 0:
105
+ return False
106
+ return all(f.is_complete for f in self.files.values())
107
+
108
+ @property
109
+ def speed_bytes_per_sec(self) -> float:
110
+ """Get overall download speed."""
111
+ total_speed = sum(f.speed_bytes_per_sec for f in self.files.values() if f.started_at)
112
+ return total_speed
113
+
114
+ @property
115
+ def elapsed_time(self) -> float:
116
+ """Get elapsed time in seconds."""
117
+ if self.started_at is None:
118
+ return 0.0
119
+ end_time = self.completed_at or time.time()
120
+ return end_time - self.started_at
121
+
122
+ def to_dict(self) -> Dict:
123
+ """Convert to dictionary for JSON serialization."""
124
+ return {
125
+ "model_path": self.model_path,
126
+ "total_bytes": self.total_bytes,
127
+ "downloaded_bytes": self.downloaded_bytes,
128
+ "percentage": round(self.percentage, 2),
129
+ "speed_bytes_per_sec": round(self.speed_bytes_per_sec, 2),
130
+ "speed_mb_per_sec": round(self.speed_bytes_per_sec / (1024 * 1024), 2),
131
+ "elapsed_time": round(self.elapsed_time, 2),
132
+ "is_complete": self.is_complete,
133
+ "files_count": len(self.files),
134
+ "files_completed": sum(1 for f in self.files.values() if f.is_complete),
135
+ "files": {
136
+ name: {
137
+ "filename": f.filename,
138
+ "total_bytes": f.total_bytes,
139
+ "downloaded_bytes": f.downloaded_bytes,
140
+ "percentage": round(f.percentage, 2),
141
+ "speed_mb_per_sec": round(f.speed_bytes_per_sec / (1024 * 1024), 2),
142
+ "is_complete": f.is_complete
143
+ }
144
+ for name, f in self.files.items()
145
+ }
146
+ }
147
+
148
+
149
+ class ProgressTracker:
150
+ """Thread-safe progress tracker for multiple models."""
151
+
152
+ def __init__(self):
153
+ self._progress: Dict[str, ModelDownloadProgress] = {}
154
+ self._lock = threading.Lock()
155
+
156
+ def get_or_create(self, model_path: str) -> ModelDownloadProgress:
157
+ """Get or create progress tracker for a model."""
158
+ with self._lock:
159
+ if model_path not in self._progress:
160
+ self._progress[model_path] = ModelDownloadProgress(model_path=model_path)
161
+ return self._progress[model_path]
162
+
163
+ def get(self, model_path: str) -> Optional[ModelDownloadProgress]:
164
+ """Get progress tracker for a model."""
165
+ with self._lock:
166
+ return self._progress.get(model_path)
167
+
168
+ def update(self, model_path: str, filename: str, downloaded: int, total: int):
169
+ """Update download progress for a file."""
170
+ progress = self.get_or_create(model_path)
171
+ progress.update_file(filename, downloaded, total)
172
+
173
+ def complete_file(self, model_path: str, filename: str):
174
+ """Mark a file as complete."""
175
+ progress = self.get(model_path)
176
+ if progress:
177
+ progress.complete_file(filename)
178
+
179
+ def complete_model(self, model_path: str):
180
+ """Mark entire model download as complete."""
181
+ progress = self.get(model_path)
182
+ if progress:
183
+ progress.completed_at = time.time()
184
+
185
+ def get_all(self) -> Dict[str, Dict]:
186
+ """Get all progress as dictionary."""
187
+ with self._lock:
188
+ return {
189
+ path: prog.to_dict()
190
+ for path, prog in self._progress.items()
191
+ }
192
+
193
+ def get_model_progress(self, model_path: str) -> Optional[Dict]:
194
+ """Get progress for a specific model."""
195
+ progress = self.get(model_path)
196
+ if progress:
197
+ return progress.to_dict()
198
+ return None
199
+
200
+
201
+ # Global progress tracker instance
202
+ _global_tracker = ProgressTracker()
203
+
204
+
205
+ def get_progress_tracker() -> ProgressTracker:
206
+ """Get global progress tracker instance."""
207
+ return _global_tracker
208
+
209
+
210
+ def create_progress_callback(model_path: str):
211
+ """
212
+ Create a progress callback for huggingface_hub downloads.
213
+
214
+ Usage:
215
+ from huggingface_hub import snapshot_download
216
+ callback = create_progress_callback("Qwen/Qwen2.5-32B-Instruct")
217
+ snapshot_download(repo_id=model_path, resume_download=True,
218
+ tqdm_class=callback)
219
+ """
220
+ tracker = get_progress_tracker()
221
+
222
+ class ProgressCallback:
223
+ """Progress callback for tqdm."""
224
+
225
+ def __init__(self, *args, **kwargs):
226
+ # Store tqdm arguments but don't initialize yet
227
+ self.tqdm_args = args
228
+ self.tqdm_kwargs = kwargs
229
+ self.current_file = None
230
+
231
+ def __call__(self, *args, **kwargs):
232
+ # This will be called by huggingface_hub
233
+ # We'll intercept the progress updates
234
+ pass
235
+
236
+ def update(self, n: int = 1):
237
+ """Update progress."""
238
+ if self.current_file:
239
+ # Get current progress from tqdm
240
+ if hasattr(self, 'n'):
241
+ downloaded = self.n
242
+ else:
243
+ downloaded = n
244
+ if hasattr(self, 'total'):
245
+ total = self.total
246
+ else:
247
+ total = 0
248
+ tracker.update(model_path, self.current_file, downloaded, total)
249
+
250
+ def set_description(self, desc: str):
251
+ """Set description (filename)."""
252
+ # Extract filename from description
253
+ if desc:
254
+ self.current_file = desc.split()[-1] if ' ' in desc else desc
255
+
256
+ def close(self):
257
+ """Close progress bar."""
258
+ if self.current_file:
259
+ tracker.complete_file(model_path, self.current_file)
260
+
261
+ return ProgressCallback
262
+
263
+
264
+ def create_hf_progress_callback(model_path: str):
265
+ """
266
+ Create a progress callback compatible with huggingface_hub.
267
+ Returns a function that can be used with tqdm.
268
+ """
269
+ tracker = get_progress_tracker()
270
+ current_file = [None] # Use list to allow modification in nested function
271
+
272
+ def progress_callback(tqdm_bar):
273
+ """Progress callback function."""
274
+ if tqdm_bar.desc:
275
+ # Extract filename from description
276
+ filename = tqdm_bar.desc.split()[-1] if ' ' in tqdm_bar.desc else tqdm_bar.desc
277
+ if filename != current_file[0]:
278
+ current_file[0] = filename
279
+ if current_file[0] not in tracker.get_or_create(model_path).files:
280
+ tracker.get_or_create(model_path).files[current_file[0]] = DownloadProgress(
281
+ filename=current_file[0],
282
+ started_at=time.time()
283
+ )
284
+
285
+ if current_file[0]:
286
+ downloaded = getattr(tqdm_bar, 'n', 0)
287
+ total = getattr(tqdm_bar, 'total', 0)
288
+ tracker.update(model_path, current_file[0], downloaded, total)
289
+
290
+ return progress_callback
291
+
292
+
293
+
294
+
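Not part of the commit: a minimal usage sketch of the tracker above, assuming the module is importable as hue_portal.chatbot.download_progress (the path implied by its location). Values are illustrative only; in practice the huggingface_hub callback drives update().

from hue_portal.chatbot.download_progress import get_progress_tracker

tracker = get_progress_tracker()  # process-wide singleton defined above
# Feed a progress update by hand (normally done by the download callback).
tracker.update("Qwen/Qwen2.5-7B-Instruct", "model-00001.safetensors",
               downloaded=1_000_000, total=4_000_000)
snapshot = tracker.get_model_progress("Qwen/Qwen2.5-7B-Instruct")
if snapshot:
    print(snapshot)           # dict built by ModelDownloadProgress.to_dict()
print(tracker.get_all())      # progress for every tracked model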
backend/hue_portal/chatbot/dual_path_router.py ADDED
@@ -0,0 +1,274 @@
1
+ """
2
+ Dual-Path RAG Router - Routes queries to Fast Path (golden dataset) or Slow Path (full RAG).
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import re
7
+ import unicodedata
8
+ from dataclasses import dataclass
9
+ from typing import Dict, Optional, List, Tuple
10
+ import numpy as np
11
+ from django.db.models import Q
12
+
13
+ from hue_portal.core.models import GoldenQuery
14
+ from hue_portal.core.embeddings import get_embedding_model
15
+
16
+
17
+ @dataclass
18
+ class RouteDecision:
19
+ """Decision from Dual-Path Router."""
20
+ path: str # "fast_path" or "slow_path"
21
+ method: str # "keyword" or "llm" or "similarity" or "default"
22
+ confidence: float
23
+ matched_golden_query_id: Optional[int] = None
24
+ similarity_score: Optional[float] = None
25
+ intent: Optional[str] = None
26
+ rationale: str = ""
27
+
28
+
29
+ class KeywordRouter:
30
+ """Fast keyword-based router to match queries against golden dataset."""
31
+
32
+ def __init__(self):
33
+ self._normalize_cache = {}
34
+
35
+ def _normalize_query(self, query: str) -> str:
36
+ """Normalize query for matching (lowercase, remove accents, extra spaces)."""
37
+ if query in self._normalize_cache:
38
+ return self._normalize_cache[query]
39
+
40
+ normalized = query.lower().strip()
41
+ # Remove accents for accent-insensitive matching
42
+ normalized = unicodedata.normalize("NFD", normalized)
43
+ normalized = "".join(ch for ch in normalized if unicodedata.category(ch) != "Mn")
44
+ # Remove extra spaces
45
+ normalized = re.sub(r'\s+', ' ', normalized).strip()
46
+
47
+ self._normalize_cache[query] = normalized
48
+ return normalized
49
+
50
+ def route(self, query: str, intent: str, confidence: float) -> RouteDecision:
51
+ """
52
+ Try to match query against golden dataset using keyword matching.
53
+
54
+ Returns:
55
+ RouteDecision with path="fast_path" if match found, else path="slow_path"
56
+ """
57
+ query_normalized = self._normalize_query(query)
58
+
59
+ # Try exact match first (fastest)
60
+ try:
61
+ golden_query = GoldenQuery.objects.get(
62
+ query_normalized=query_normalized,
63
+ is_active=True
64
+ )
65
+ return RouteDecision(
66
+ path="fast_path",
67
+ method="keyword",
68
+ confidence=1.0,
69
+ matched_golden_query_id=golden_query.id,
70
+ intent=intent,
71
+ rationale="exact_match"
72
+ )
73
+ except (GoldenQuery.DoesNotExist, GoldenQuery.MultipleObjectsReturned):
74
+ pass
75
+
76
+ # Try fuzzy match: check if query contains golden query or vice versa
77
+ # This handles variations like "mức phạt vượt đèn đỏ" vs "vượt đèn đỏ phạt bao nhiêu"
78
+ try:
79
+ # Find golden queries with same intent
80
+ golden_queries = GoldenQuery.objects.filter(
81
+ intent=intent,
82
+ is_active=True
83
+ )[:50] # Limit to avoid too many comparisons
84
+
85
+ for gq in golden_queries:
86
+ gq_normalized = self._normalize_query(gq.query)
87
+
88
+ # Check if query is substring of golden query or vice versa
89
+ if (query_normalized in gq_normalized or
90
+ gq_normalized in query_normalized):
91
+ # Calculate similarity (simple Jaccard similarity)
92
+ query_words = set(query_normalized.split())
93
+ gq_words = set(gq_normalized.split())
94
+ if query_words and gq_words:
95
+ similarity = len(query_words & gq_words) / len(query_words | gq_words)
96
+ if similarity >= 0.7: # 70% word overlap
97
+ return RouteDecision(
98
+ path="fast_path",
99
+ method="keyword",
100
+ confidence=similarity,
101
+ matched_golden_query_id=gq.id,
102
+ similarity_score=similarity,
103
+ intent=intent,
104
+ rationale="fuzzy_match"
105
+ )
106
+ except Exception:
107
+ pass
108
+
109
+ # No match found
110
+ return RouteDecision(
111
+ path="slow_path",
112
+ method="keyword",
113
+ confidence=confidence,
114
+ intent=intent,
115
+ rationale="no_keyword_match"
116
+ )
117
+
118
+
119
+ class DualPathRouter:
120
+ """Main router that decides Fast Path vs Slow Path using hybrid approach."""
121
+
122
+ def __init__(self, similarity_threshold: float = 0.85):
123
+ """
124
+ Initialize Dual-Path Router.
125
+
126
+ Args:
127
+ similarity_threshold: Minimum similarity score for semantic matching (default: 0.85)
128
+ """
129
+ self.keyword_router = KeywordRouter()
130
+ self.llm_router = None # Lazy load if needed
131
+ self.similarity_threshold = similarity_threshold
132
+ self._embedding_model = None
133
+
134
+ def route(self, query: str, intent: str, confidence: float) -> RouteDecision:
135
+ """
136
+ Route query to Fast Path or Slow Path.
137
+
138
+ Args:
139
+ query: User query string.
140
+ intent: Detected intent.
141
+ confidence: Intent classification confidence.
142
+
143
+ Returns:
144
+ RouteDecision with path, method, and matched golden query ID if applicable.
145
+ """
146
+ # Step 1: Keyword-based routing (fastest, ~1-5ms)
147
+ keyword_decision = self.keyword_router.route(query, intent, confidence)
148
+ if keyword_decision.path == "fast_path":
149
+ return keyword_decision
150
+
151
+ # Step 2: Semantic similarity search in golden dataset (~50-100ms)
152
+ similarity_match = self._find_similar_golden_query(query, intent)
153
+ if similarity_match and similarity_match['score'] >= self.similarity_threshold:
154
+ return RouteDecision(
155
+ path="fast_path",
156
+ method="similarity",
157
+ confidence=similarity_match['score'],
158
+ matched_golden_query_id=similarity_match['id'],
159
+ similarity_score=similarity_match['score'],
160
+ intent=intent,
161
+ rationale="semantic_similarity"
162
+ )
163
+
164
+ # Step 3: LLM router fallback (for edge cases, ~100-200ms)
165
+ # Only use if confidence is low (uncertain intent)
166
+ if confidence < 0.7:
167
+ llm_decision = self._llm_route(query, intent)
168
+ if llm_decision and llm_decision.path == "fast_path":
169
+ return llm_decision
170
+
171
+ # Default: Slow Path (full RAG pipeline)
172
+ return RouteDecision(
173
+ path="slow_path",
174
+ method="default",
175
+ confidence=confidence,
176
+ intent=intent,
177
+ rationale="no_fast_path_match"
178
+ )
179
+
180
+ def _find_similar_golden_query(self, query: str, intent: str) -> Optional[Dict]:
181
+ """
182
+ Find similar query in golden dataset using semantic search.
183
+
184
+ Args:
185
+ query: User query.
186
+ intent: Detected intent.
187
+
188
+ Returns:
189
+ Dict with 'id' and 'score' if match found, None otherwise.
190
+ """
191
+ try:
192
+ # Get active golden queries with same intent
193
+ golden_queries = list(
194
+ GoldenQuery.objects.filter(
195
+ intent=intent,
196
+ is_active=True,
197
+ query_embedding__isnull=False
198
+ )[:100] # Limit for performance
199
+ )
200
+
201
+ if not golden_queries:
202
+ return None
203
+
204
+ # Get embedding model
205
+ embedding_model = self._get_embedding_model()
206
+ if not embedding_model:
207
+ return None
208
+
209
+ # Generate query embedding
210
+ query_embedding = embedding_model.encode(query, convert_to_numpy=True)
211
+ query_embedding = query_embedding / np.linalg.norm(query_embedding) # Normalize
212
+
213
+ # Calculate similarities
214
+ best_match = None
215
+ best_score = 0.0
216
+
217
+ for gq in golden_queries:
218
+ if not gq.query_embedding:
219
+ continue
220
+
221
+ # Load golden query embedding
222
+ gq_embedding = np.array(gq.query_embedding)
223
+ if len(gq_embedding) == 0:
224
+ continue
225
+
226
+ # Normalize
227
+ gq_embedding = gq_embedding / np.linalg.norm(gq_embedding)
228
+
229
+ # Calculate cosine similarity
230
+ similarity = float(np.dot(query_embedding, gq_embedding))
231
+
232
+ if similarity > best_score:
233
+ best_score = similarity
234
+ best_match = gq.id
235
+
236
+ if best_match and best_score >= self.similarity_threshold:
237
+ return {
238
+ 'id': best_match,
239
+ 'score': best_score
240
+ }
241
+
242
+ return None
243
+
244
+ except Exception as e:
245
+ # Log error but don't fail
246
+ import logging
247
+ logger = logging.getLogger(__name__)
248
+ logger.warning(f"Error in semantic similarity search: {e}")
249
+ return None
250
+
251
+ def _get_embedding_model(self):
252
+ """Lazy load embedding model."""
253
+ if self._embedding_model is None:
254
+ self._embedding_model = get_embedding_model()
255
+ return self._embedding_model
256
+
257
+ def _llm_route(self, query: str, intent: str) -> Optional[RouteDecision]:
258
+ """
259
+ Use LLM to decide routing (optional, for edge cases).
260
+
261
+ This is a fallback for low-confidence queries where keyword and similarity
262
+ didn't find a match, but LLM might recognize it as a common query.
263
+
264
+ Args:
265
+ query: User query.
266
+ intent: Detected intent.
267
+
268
+ Returns:
269
+ RouteDecision if LLM finds a match, None otherwise.
270
+ """
271
+ # For now, return None (LLM routing can be implemented later if needed)
272
+ # This would require a small LLM (7B) to classify if query matches golden dataset
273
+ return None
274
+
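Not part of the commit: a sketch of how the router above might be called from the chatbot pipeline. This is hypothetical glue code; a configured Django app with GoldenQuery rows and the embedding model is assumed.

from hue_portal.chatbot.dual_path_router import DualPathRouter

router = DualPathRouter(similarity_threshold=0.85)
decision = router.route("vượt đèn đỏ phạt bao nhiêu", intent="search_fine", confidence=0.92)
if decision.path == "fast_path":
    # keyword or semantic hit against the golden dataset
    print("golden query:", decision.matched_golden_query_id, "via", decision.method)
else:
    print("slow path:", decision.rationale)  # fall through to the full RAG pipeline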
backend/hue_portal/chatbot/entity_extraction.py ADDED
@@ -0,0 +1,395 @@
1
+ """
2
+ Entity extraction utilities for extracting fine codes, procedure names, and resolving pronouns.
3
+ """
4
+ import re
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ from hue_portal.core.models import Fine, Procedure, Office
7
+
8
+
9
+ def extract_fine_code(text: str) -> Optional[str]:
10
+ """
11
+ Extract fine code (V001, V002, etc.) from text.
12
+
13
+ Args:
14
+ text: Input text.
15
+
16
+ Returns:
17
+ Fine code string or None if not found.
18
+ """
19
+ # Pattern: V followed by 3 digits
20
+ pattern = r'\bV\d{3}\b'
21
+ matches = re.findall(pattern, text, re.IGNORECASE)
22
+ if matches:
23
+ return matches[0].upper()
24
+ return None
25
+
26
+
27
+ def extract_procedure_name(text: str) -> Optional[str]:
28
+ """
29
+ Extract procedure name from text by matching against database.
30
+
31
+ Args:
32
+ text: Input text.
33
+
34
+ Returns:
35
+ Procedure name or None if not found.
36
+ """
37
+ text_lower = text.lower()
38
+
39
+ # Get all procedures and check for matches
40
+ procedures = Procedure.objects.all()
41
+ for procedure in procedures:
42
+ procedure_title_lower = procedure.title.lower()
43
+ # Check if procedure title appears in text
44
+ if procedure_title_lower in text_lower or text_lower in procedure_title_lower:
45
+ return procedure.title
46
+
47
+ return None
48
+
49
+
50
+ def extract_office_name(text: str) -> Optional[str]:
51
+ """
52
+ Extract office/unit name from text by matching against database.
53
+
54
+ Args:
55
+ text: Input text.
56
+
57
+ Returns:
58
+ Office name or None if not found.
59
+ """
60
+ text_lower = text.lower()
61
+
62
+ # Get all offices and check for matches
63
+ offices = Office.objects.all()
64
+ for office in offices:
65
+ office_name_lower = office.unit_name.lower()
66
+ # Check if office name appears in text
67
+ if office_name_lower in text_lower or text_lower in office_name_lower:
68
+ return office.unit_name
69
+
70
+ return None
71
+
72
+
73
+ def extract_reference_pronouns(text: str, context: Optional[List[Dict[str, Any]]] = None) -> List[str]:
74
+ """
75
+ Extract reference pronouns from text.
76
+
77
+ Args:
78
+ text: Input text.
79
+ context: Optional context from recent messages.
80
+
81
+ Returns:
82
+ List of pronouns found.
83
+ """
84
+ # Vietnamese reference pronouns
85
+ pronouns = [
86
+ "cái đó", "cái này", "cái kia",
87
+ "như vậy", "như thế",
88
+ "thủ tục đó", "thủ tục này",
89
+ "mức phạt đó", "mức phạt này",
90
+ "đơn vị đó", "đơn vị này",
91
+ "nó", "đó", "này", "kia"
92
+ ]
93
+
94
+ text_lower = text.lower()
95
+ found_pronouns = []
96
+
97
+ for pronoun in pronouns:
98
+ if pronoun in text_lower:
99
+ found_pronouns.append(pronoun)
100
+
101
+ return found_pronouns
102
+
103
+
104
+ def enhance_query_with_context(query: str, recent_messages: List[Dict[str, Any]]) -> str:
105
+ """
106
+ Enhance query with entities from conversation context.
107
+ This is more comprehensive than resolve_pronouns - it adds context even when query already has keywords.
108
+
109
+ Args:
110
+ query: Current query.
111
+ recent_messages: List of recent messages with role, content, intent, entities.
112
+
113
+ Returns:
114
+ Enhanced query with context entities added.
115
+ """
116
+ if not recent_messages:
117
+ return query
118
+
119
+ # Collect entities from recent messages (reverse order - most recent first)
120
+ entities_found = {}
121
+
122
+ for msg in reversed(recent_messages):
123
+ # Check message content for entities
124
+ content = msg.get("content", "")
125
+
126
+ # Extract document code (highest priority for legal queries)
127
+ document_code = extract_document_code(content)
128
+ if document_code and "document_code" not in entities_found:
129
+ entities_found["document_code"] = document_code
130
+
131
+ # Extract fine code
132
+ fine_code = extract_fine_code(content)
133
+ if fine_code and "fine_code" not in entities_found:
134
+ entities_found["fine_code"] = fine_code
135
+
136
+ # Extract procedure name
137
+ procedure_name = extract_procedure_name(content)
138
+ if procedure_name and "procedure_name" not in entities_found:
139
+ entities_found["procedure_name"] = procedure_name
140
+
141
+ # Extract office name
142
+ office_name = extract_office_name(content)
143
+ if office_name and "office_name" not in entities_found:
144
+ entities_found["office_name"] = office_name
145
+
146
+ # Check entities field
147
+ msg_entities = msg.get("entities", {})
148
+ for key, value in msg_entities.items():
149
+ if key not in entities_found:
150
+ entities_found[key] = value
151
+
152
+ # Check intent to infer entity type
153
+ intent = msg.get("intent", "")
154
+ if intent == "search_fine" and "fine_name" not in entities_found:
155
+ # Try to extract fine name from content
156
+ fine_keywords = ["vượt đèn đỏ", "mũ bảo hiểm", "nồng độ cồn", "t��c độ"]
157
+ for keyword in fine_keywords:
158
+ if keyword in content.lower():
159
+ entities_found["fine_name"] = keyword
160
+ break
161
+
162
+ if intent == "search_procedure" and "procedure_name" not in entities_found:
163
+ procedure_keywords = ["đăng ký", "thủ tục", "cư trú", "antt", "pccc"]
164
+ for keyword in procedure_keywords:
165
+ if keyword in content.lower():
166
+ entities_found["procedure_name"] = keyword
167
+ break
168
+
169
+ if intent == "search_legal" and "document_code" not in entities_found:
170
+ # Try to extract document code from content if not already found
171
+ doc_code = extract_document_code(content)
172
+ if doc_code:
173
+ entities_found["document_code"] = doc_code
174
+
175
+ # Enhance query with context entities
176
+ enhanced_parts = [query]
177
+ query_lower = query.lower()
178
+
179
+ # If query mentions a document but doesn't have the code, add it from context
180
+ if "thông tư" in query_lower or "quyết định" in query_lower or "quy định" in query_lower:
181
+ if "document_code" in entities_found:
182
+ doc_code = entities_found["document_code"]
183
+ # Only add if not already in query
184
+ if doc_code.lower() not in query_lower:
185
+ enhanced_parts.append(doc_code)
186
+
187
+ # Add document code if intent is legal and code is in context
188
+ # This helps with follow-up questions like "nói rõ hơn về thông tư 02"
189
+ if "document_code" in entities_found:
190
+ doc_code = entities_found["document_code"]
191
+ if doc_code.lower() not in query_lower:
192
+ # Add document code to enhance search
193
+ enhanced_parts.append(doc_code)
194
+
195
+ return " ".join(enhanced_parts)
196
+
197
+
198
+ def resolve_pronouns(query: str, recent_messages: List[Dict[str, Any]]) -> str:
199
+ """
200
+ Resolve pronouns in query by replacing them with actual entities from context.
201
+ This is a simpler version that only handles pronoun replacement.
202
+ For comprehensive context enhancement, use enhance_query_with_context().
203
+
204
+ Args:
205
+ query: Current query with pronouns.
206
+ recent_messages: List of recent messages with role, content, intent, entities.
207
+
208
+ Returns:
209
+ Enhanced query with pronouns resolved.
210
+ """
211
+ if not recent_messages:
212
+ return query
213
+
214
+ # Check for pronouns
215
+ pronouns = extract_reference_pronouns(query)
216
+ if not pronouns:
217
+ return query
218
+
219
+ # Look for entities in recent messages (reverse order - most recent first)
220
+ resolved_query = query
221
+ entities_found = {}
222
+
223
+ for msg in reversed(recent_messages):
224
+ # Check message content for entities
225
+ content = msg.get("content", "")
226
+
227
+ # Extract fine code
228
+ fine_code = extract_fine_code(content)
229
+ if fine_code and "fine_code" not in entities_found:
230
+ entities_found["fine_code"] = fine_code
231
+
232
+ # Extract procedure name
233
+ procedure_name = extract_procedure_name(content)
234
+ if procedure_name and "procedure_name" not in entities_found:
235
+ entities_found["procedure_name"] = procedure_name
236
+
237
+ # Extract office name
238
+ office_name = extract_office_name(content)
239
+ if office_name and "office_name" not in entities_found:
240
+ entities_found["office_name"] = office_name
241
+
242
+ # Extract document code
243
+ document_code = extract_document_code(content)
244
+ if document_code and "document_code" not in entities_found:
245
+ entities_found["document_code"] = document_code
246
+
247
+ # Check entities field
248
+ msg_entities = msg.get("entities", {})
249
+ for key, value in msg_entities.items():
250
+ if key not in entities_found:
251
+ entities_found[key] = value
252
+
253
+ # Check intent to infer entity type
254
+ intent = msg.get("intent", "")
255
+ if intent == "search_fine" and "fine_name" not in entities_found:
256
+ fine_keywords = ["vượt đèn đỏ", "mũ bảo hiểm", "nồng độ cồn", "tốc độ"]
257
+ for keyword in fine_keywords:
258
+ if keyword in content.lower():
259
+ entities_found["fine_name"] = keyword
260
+ break
261
+
262
+ if intent == "search_procedure" and "procedure_name" not in entities_found:
263
+ procedure_keywords = ["đăng ký", "thủ tục", "cư trú", "antt", "pccc"]
264
+ for keyword in procedure_keywords:
265
+ if keyword in content.lower():
266
+ entities_found["procedure_name"] = keyword
267
+ break
268
+
269
+ # Replace pronouns with entities
270
+ query_lower = query.lower()
271
+
272
+ # Replace "cái đó", "cái này", "nó" with most relevant entity
273
+ if any(pronoun in query_lower for pronoun in ["cái đó", "cái này", "nó", "đó"]):
274
+ if "document_code" in entities_found:
275
+ resolved_query = re.sub(
276
+ r'\b(cái đó|cái này|nó|đó)\b',
277
+ entities_found["document_code"],
278
+ resolved_query,
279
+ flags=re.IGNORECASE
280
+ )
281
+ elif "fine_name" in entities_found:
282
+ resolved_query = re.sub(
283
+ r'\b(cái đó|cái này|nó|đó)\b',
284
+ entities_found["fine_name"],
285
+ resolved_query,
286
+ flags=re.IGNORECASE
287
+ )
288
+ elif "procedure_name" in entities_found:
289
+ resolved_query = re.sub(
290
+ r'\b(cái đó|cái này|nó|đó)\b',
291
+ entities_found["procedure_name"],
292
+ resolved_query,
293
+ flags=re.IGNORECASE
294
+ )
295
+ elif "office_name" in entities_found:
296
+ resolved_query = re.sub(
297
+ r'\b(cái đó|cái này|nó|đó)\b',
298
+ entities_found["office_name"],
299
+ resolved_query,
300
+ flags=re.IGNORECASE
301
+ )
302
+
303
+ # Replace "thủ tục đó", "thủ tục này" with procedure name
304
+ if "thủ tục" in query_lower and "procedure_name" in entities_found:
305
+ resolved_query = re.sub(
306
+ r'\bthủ tục (đó|này)\b',
307
+ entities_found["procedure_name"],
308
+ resolved_query,
309
+ flags=re.IGNORECASE
310
+ )
311
+
312
+ # Replace "mức phạt đó", "mức phạt này" with fine name
313
+ if "mức phạt" in query_lower and "fine_name" in entities_found:
314
+ resolved_query = re.sub(
315
+ r'\bmức phạt (đó|này)\b',
316
+ entities_found["fine_name"],
317
+ resolved_query,
318
+ flags=re.IGNORECASE
319
+ )
320
+
321
+ return resolved_query
322
+
323
+
324
+ def extract_document_code(text: str) -> Optional[str]:
325
+ """
326
+ Extract legal document code from text (e.g., "thông tư 02", "quyết định 264").
327
+
328
+ Args:
329
+ text: Input text.
330
+
331
+ Returns:
332
+ Document code string or None if not found.
333
+ """
334
+ # Patterns for legal document codes
335
+ patterns = [
336
+ r'\bthông tư\s+(\d+[-\w]*)',
337
+ r'\btt\s+(\d+[-\w]*)',
338
+ r'\bquyết định\s+(\d+[-\w]*)',
339
+ r'\bqd\s+(\d+[-\w]*)',
340
+ r'\bquy định\s+(\d+[-\w]*)',
341
+ r'\b(\d+[-\w]*)\s*[-/]\s*QĐ[-/]TW',
342
+ r'\b(\d+[-\w]*)\s*[-/]\s*TT',
343
+ ]
344
+
345
+ text_lower = text.lower()
346
+ for pattern in patterns:
347
+ matches = re.findall(pattern, text_lower, re.IGNORECASE)
348
+ if matches:
349
+ # Return the full match with document type
350
+ full_match = re.search(pattern, text_lower, re.IGNORECASE)
351
+ if full_match:
352
+ return full_match.group(0)
353
+
354
+ return None
355
+
356
+
357
+ def extract_all_entities(text: str) -> Dict[str, Any]:
358
+ """
359
+ Extract all entities from text.
360
+
361
+ Args:
362
+ text: Input text.
363
+
364
+ Returns:
365
+ Dictionary with all extracted entities.
366
+ """
367
+ entities = {}
368
+
369
+ # Extract fine code
370
+ fine_code = extract_fine_code(text)
371
+ if fine_code:
372
+ entities["fine_code"] = fine_code
373
+
374
+ # Extract procedure name
375
+ procedure_name = extract_procedure_name(text)
376
+ if procedure_name:
377
+ entities["procedure_name"] = procedure_name
378
+
379
+ # Extract office name
380
+ office_name = extract_office_name(text)
381
+ if office_name:
382
+ entities["office_name"] = office_name
383
+
384
+ # Extract document code
385
+ document_code = extract_document_code(text)
386
+ if document_code:
387
+ entities["document_code"] = document_code
388
+
389
+ # Extract pronouns
390
+ pronouns = extract_reference_pronouns(text)
391
+ if pronouns:
392
+ entities["pronouns"] = pronouns
393
+
394
+ return entities
395
+
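Not part of the commit: a small sketch of the extraction helpers above. extract_procedure_name and extract_office_name query the database, so a configured Django environment is assumed; the literal queries and expected outputs are illustrative.

from hue_portal.chatbot.entity_extraction import extract_all_entities, resolve_pronouns

entities = extract_all_entities("Mức phạt V012 theo thông tư 02 là bao nhiêu?")
print(entities)  # expected to include {"fine_code": "V012", "document_code": "thông tư 02", ...}

history = [{"role": "user", "content": "thông tư 02 quy định gì?",
            "intent": "search_legal", "entities": {}}]
# The pronoun "cái đó" should be replaced by the document code found in the history.
print(resolve_pronouns("nói rõ hơn về cái đó", history))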
backend/hue_portal/chatbot/exact_match_cache.py ADDED
@@ -0,0 +1,61 @@
1
+ """
2
+ Exact match cache for caching repeated chatbot responses.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import copy
7
+ import time
8
+ import unicodedata
9
+ import re
10
+ from collections import OrderedDict
11
+ from typing import Any, Dict, Optional, Tuple
12
+
13
+
14
+ class ExactMatchCache:
15
+ """LRU cache that stores full chatbot responses for exact queries."""
16
+
17
+ def __init__(self, max_size: int = 256, ttl_seconds: Optional[int] = 43200):
18
+ self.max_size = max(1, max_size)
19
+ self.ttl = ttl_seconds
20
+ self._store: "OrderedDict[str, Tuple[float, Dict[str, Any]]]" = OrderedDict()
21
+
22
+ def get(self, query: str, intent: Optional[str] = None) -> Optional[Dict[str, Any]]:
23
+ """Return cached response if still valid."""
24
+ key = self._make_key(query, intent)
25
+ record = self._store.get(key)
26
+ if not record:
27
+ return None
28
+
29
+ timestamp, payload = record
30
+ if self.ttl and (time.time() - timestamp) > self.ttl:
31
+ self._store.pop(key, None)
32
+ return None
33
+
34
+ self._store.move_to_end(key)
35
+ return copy.deepcopy(payload)
36
+
37
+ def set(self, query: str, intent: Optional[str], response: Dict[str, Any]) -> None:
38
+ """Store response for normalized query/int."""
39
+ key = self._make_key(query, intent)
40
+ self._store[key] = (time.time(), copy.deepcopy(response))
41
+ self._store.move_to_end(key)
42
+ if len(self._store) > self.max_size:
43
+ self._store.popitem(last=False)
44
+
45
+ def clear(self) -> None:
46
+ """Remove all cached entries."""
47
+ self._store.clear()
48
+
49
+ def _make_key(self, query: str, intent: Optional[str]) -> str:
50
+ normalized_query = self._normalize_query(query or "")
51
+ normalized_intent = (intent or "").strip().lower()
52
+ return f"{normalized_intent}::{normalized_query}"
53
+
54
+ def _normalize_query(self, query: str) -> str:
55
+ """Normalize query for stable caching."""
56
+ text = query.lower().strip()
57
+ text = unicodedata.normalize("NFD", text)
58
+ text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
59
+ text = re.sub(r"\s+", " ", text)
60
+ return text
61
+
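Not part of the commit: a usage sketch of the cache above with hypothetical values.

from hue_portal.chatbot.exact_match_cache import ExactMatchCache

cache = ExactMatchCache(max_size=256, ttl_seconds=43200)
response = {"message": "Phạt 200.000 - 400.000 VNĐ", "intent": "search_fine",
            "results": [], "count": 0}
cache.set("Vượt đèn đỏ phạt bao nhiêu?", "search_fine", response)

# Case, extra whitespace, and intent casing all normalize to the same key, so this is a hit:
hit = cache.get("  VƯỢT ĐÈN ĐỎ   PHẠT BAO NHIÊU?  ", "SEARCH_FINE")
print(hit is not None)  # True; get() returns a deep copy of the stored payload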
backend/hue_portal/chatbot/fast_path_handler.py ADDED
@@ -0,0 +1,59 @@
1
+ """
2
+ Fast Path Handler - Returns cached responses from golden dataset.
3
+ """
4
+ from typing import Dict, Any
5
+ from hue_portal.core.models import GoldenQuery
6
+
7
+
8
+ class FastPathHandler:
9
+ """Handle Fast Path queries using golden dataset."""
10
+
11
+ def handle(self, query: str, golden_query_id: int) -> Dict[str, Any]:
12
+ """
13
+ Get cached response from golden dataset.
14
+
15
+ Args:
16
+ query: User query (for logging).
17
+ golden_query_id: ID of matched golden query.
18
+
19
+ Returns:
20
+ Response dict (same format as Slow Path) with additional metadata.
21
+ """
22
+ try:
23
+ golden_query = GoldenQuery.objects.get(id=golden_query_id, is_active=True)
24
+ except GoldenQuery.DoesNotExist:
25
+ # Fallback: return error response
26
+ return {
27
+ "message": "Xin lỗi, không tìm thấy thông tin trong cơ sở dữ liệu.",
28
+ "intent": "error",
29
+ "results": [],
30
+ "count": 0,
31
+ "_source": "fast_path",
32
+ "_error": "golden_query_not_found"
33
+ }
34
+
35
+ # Increment usage count (synchronous save; could be deferred/async for performance)
36
+ golden_query.usage_count += 1
37
+ golden_query.save(update_fields=['usage_count'])
38
+
39
+ # Return cached response
40
+ response = golden_query.response_data.copy()
41
+
42
+ # Add metadata
43
+ response['_source'] = 'fast_path'
44
+ response['_golden_query_id'] = golden_query_id
45
+ response['_verified_by'] = golden_query.verified_by
46
+ response['_accuracy_score'] = golden_query.accuracy_score
47
+
48
+ # Ensure required fields exist
49
+ if 'message' not in response:
50
+ response['message'] = golden_query.response_message
51
+
52
+ if 'intent' not in response:
53
+ response['intent'] = golden_query.intent
54
+
55
+ if 'count' not in response:
56
+ response['count'] = len(response.get('results', []))
57
+
58
+ return response
59
+
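Not part of the commit: hypothetical glue code wiring a routing decision into the handler above (both modules ship in this commit; the query and intent are illustrative).

from hue_portal.chatbot.dual_path_router import DualPathRouter
from hue_portal.chatbot.fast_path_handler import FastPathHandler

query = "vượt đèn đỏ phạt bao nhiêu"
decision = DualPathRouter().route(query, intent="search_fine", confidence=0.9)
if decision.path == "fast_path" and decision.matched_golden_query_id:
    response = FastPathHandler().handle(query, decision.matched_golden_query_id)
    print(response["_source"], response.get("message"))  # "fast_path" plus the cached answer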
backend/hue_portal/chatbot/legal_guardrails.py ADDED
@@ -0,0 +1,35 @@
1
+ """
2
+ Guardrails RAIL schema and helpers for structured legal answers.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from functools import lru_cache
8
+ from pathlib import Path
9
+ from typing import Dict, Optional
10
+
11
+ from guardrails import Guard
12
+
13
+ SCHEMA_DIR = Path(__file__).resolve().parent / "schemas"
14
+ RAIL_PATH = SCHEMA_DIR / "legal_answer.rail"
15
+
16
+
17
+ @lru_cache(maxsize=1)
18
+ def get_legal_guard() -> Guard:
19
+ """Return cached Guard instance for legal answers."""
20
+
21
+ return Guard.from_rail(rail_file=str(RAIL_PATH))
22
+
23
+
24
+ def ensure_schema_files() -> Optional[Dict[str, str]]:
25
+ """
26
+ Return metadata for the legal RAIL schema to help packaging.
27
+
28
+ Called during setup to make sure the schema file is discovered by tools
29
+ such as setup scripts or bundlers.
30
+ """
31
+
32
+ if RAIL_PATH.exists():
33
+ return {"legal_rail": str(RAIL_PATH)}
34
+ return None
35
+
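Not part of the commit: a sketch of the helpers above. Only get_legal_guard and ensure_schema_files are defined in this module; the Guard object itself, and how it validates LLM output, comes from the guardrails package.

from hue_portal.chatbot.legal_guardrails import ensure_schema_files, get_legal_guard

print(ensure_schema_files())   # {"legal_rail": ".../schemas/legal_answer.rail"} when the RAIL file exists
guard = get_legal_guard()      # built from legal_answer.rail and cached via lru_cache
# guard is then used downstream (see llm_integration.py) to validate structured legal answers.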
backend/hue_portal/chatbot/llm_integration.py ADDED
@@ -0,0 +1,1746 @@
1
+ """
2
+ LLM integration for natural answer generation.
3
+ Supports OpenAI GPT, Anthropic Claude, Ollama, Hugging Face Inference API, Local Hugging Face models, and API mode.
4
+ """
5
+ import os
6
+ import re
7
+ import json
8
+ import sys
9
+ import traceback
10
+ import logging
11
+ import time
12
+ from pathlib import Path
13
+ from typing import List, Dict, Any, Optional, Set, Tuple
14
+
15
+ from .structured_legal import (
16
+ build_structured_legal_prompt,
17
+ get_legal_output_parser,
18
+ parse_structured_output,
19
+ LegalAnswer,
20
+ )
21
+ from .legal_guardrails import get_legal_guard
22
+ try:
23
+ from dotenv import load_dotenv
24
+ load_dotenv()
25
+ except ImportError:
26
+ pass # dotenv is optional
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ BASE_DIR = Path(__file__).resolve().parents[2]
31
+ GUARDRAILS_LOG_DIR = BASE_DIR / "logs" / "guardrails"
32
+ GUARDRAILS_LOG_FILE = GUARDRAILS_LOG_DIR / "legal_structured.log"
33
+
34
+
35
+ def _write_guardrails_debug(label: str, content: Optional[str]) -> None:
36
+ """Persist raw Guardrails inputs/outputs for debugging."""
37
+ if not content:
38
+ return
39
+ try:
40
+ GUARDRAILS_LOG_DIR.mkdir(parents=True, exist_ok=True)
41
+ timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
42
+ snippet = content.strip()
43
+ max_len = 4000
44
+ if len(snippet) > max_len:
45
+ snippet = snippet[:max_len] + "...[truncated]"
46
+ with GUARDRAILS_LOG_FILE.open("a", encoding="utf-8") as fp:
47
+ fp.write(f"[{timestamp}] [{label}] {snippet}\n{'-' * 80}\n")
48
+ except Exception as exc:
49
+ logger.debug("Unable to write guardrails log: %s", exc)
50
+
51
+
52
+ def _collect_doc_metadata(documents: List[Any]) -> Tuple[Set[str], Set[str]]:
53
+ titles: Set[str] = set()
54
+ sections: Set[str] = set()
55
+ for doc in documents:
56
+ document = getattr(doc, "document", None)
57
+ title = getattr(document, "title", None)
58
+ if title:
59
+ titles.add(title.strip())
60
+ section_code = getattr(doc, "section_code", None)
61
+ if section_code:
62
+ sections.add(section_code.strip())
63
+ return titles, sections
64
+
65
+
66
+ def _contains_any(text: str, tokens: Set[str]) -> bool:
67
+ if not tokens:
68
+ return True
69
+ normalized = text.lower()
70
+ return any(token.lower() in normalized for token in tokens if token)
71
+
72
+
73
+ def _validate_structured_answer(
74
+ answer: "LegalAnswer",
75
+ documents: List[Any],
76
+ ) -> Tuple[bool, str]:
77
+ """Ensure structured answer references actual documents/sections."""
78
+ allowed_titles, allowed_sections = _collect_doc_metadata(documents)
79
+ if allowed_titles and not _contains_any(answer.summary, allowed_titles):
80
+ return False, "Summary thiếu tên văn bản từ bảng tham chiếu"
81
+
82
+ for idx, bullet in enumerate(answer.details, 1):
83
+ if allowed_titles and not _contains_any(bullet, allowed_titles):
84
+ return False, f"Chi tiết {idx} thiếu tên văn bản"
85
+ if allowed_sections and not _contains_any(bullet, allowed_sections):
86
+ return False, f"Chi tiết {idx} thiếu mã điều/khoản"
87
+
88
+ allowed_title_lower = {title.lower() for title in allowed_titles}
89
+ allowed_section_lower = {section.lower() for section in allowed_sections}
90
+
91
+ for idx, citation in enumerate(answer.citations, 1):
92
+ if citation.document_title and citation.document_title.lower() not in allowed_title_lower:
93
+ return False, f"Citation {idx} chứa văn bản không có trong nguồn"
94
+ if (
95
+ citation.section_code
96
+ and allowed_section_lower
97
+ and citation.section_code.lower() not in allowed_section_lower
98
+ ):
99
+ return False, f"Citation {idx} chứa điều/khoản không có trong nguồn"
100
+
101
+ return True, ""
102
+
103
+ # Import download progress tracker (optional)
104
+ try:
105
+ from .download_progress import get_progress_tracker, DownloadProgress
106
+ PROGRESS_TRACKER_AVAILABLE = True
107
+ except ImportError:
108
+ PROGRESS_TRACKER_AVAILABLE = False
109
+ logger.warning("Download progress tracker not available")
110
+
111
+ # LLM Provider types
112
+ LLM_PROVIDER_OPENAI = "openai"
113
+ LLM_PROVIDER_ANTHROPIC = "anthropic"
114
+ LLM_PROVIDER_OLLAMA = "ollama"
115
+ LLM_PROVIDER_HUGGINGFACE = "huggingface" # Hugging Face Inference API
116
+ LLM_PROVIDER_LOCAL = "local" # Local Hugging Face Transformers model
117
+ LLM_PROVIDER_LLAMA_CPP = "llama_cpp" # GGUF via llama.cpp
118
+ LLM_PROVIDER_API = "api" # API mode - call HF Spaces API
119
+ LLM_PROVIDER_NONE = "none"
120
+
121
+ # Get provider from environment (default to llama.cpp Gemma if none provided)
122
+ DEFAULT_LLM_PROVIDER = os.environ.get(
123
+ "DEFAULT_LLM_PROVIDER",
124
+ LLM_PROVIDER_LLAMA_CPP,
125
+ ).lower()
126
+ env_provider = os.environ.get("LLM_PROVIDER", "").strip().lower()
127
+ LLM_PROVIDER = env_provider or DEFAULT_LLM_PROVIDER
128
+ LLM_MODE = os.environ.get("LLM_MODE", "answer").strip().lower() or "answer"
129
+ LEGAL_STRUCTURED_MAX_ATTEMPTS = max(
130
+ 1, int(os.environ.get("LEGAL_STRUCTURED_MAX_ATTEMPTS", "2"))
131
+ )
132
+
133
+
134
+ class LLMGenerator:
135
+ """Generate natural language answers using LLMs."""
136
+
137
+ # Class-level cache for llama.cpp model (shared across all instances in same process)
138
+ _llama_cpp_shared = None
139
+ _llama_cpp_model_path_shared = None
140
+
141
+ def __init__(self, provider: Optional[str] = None):
142
+ """
143
+ Initialize LLM generator.
144
+
145
+ Args:
146
+ provider: LLM provider ('openai', 'anthropic', 'ollama', 'local', 'huggingface', 'api', or None for auto-detect).
147
+ """
148
+ self.provider = provider or LLM_PROVIDER
149
+ self.llm_mode = LLM_MODE if LLM_MODE in {"keywords", "answer"} else "answer"
150
+ self.client = None
151
+ self.local_model = None
152
+ self.local_tokenizer = None
153
+ self.llama_cpp = None
154
+ self.llama_cpp_model_path = None
155
+ self.api_base_url = None
156
+ self._initialize_client()
157
+
158
+ def _initialize_client(self):
159
+ """Initialize LLM client based on provider."""
160
+ if self.provider == LLM_PROVIDER_OPENAI:
161
+ try:
162
+ import openai
163
+ api_key = os.environ.get("OPENAI_API_KEY")
164
+ if api_key:
165
+ self.client = openai.OpenAI(api_key=api_key)
166
+ print("✅ OpenAI client initialized")
167
+ else:
168
+ print("⚠️ OPENAI_API_KEY not found, OpenAI disabled")
169
+ except ImportError:
170
+ print("⚠️ openai package not installed, install with: pip install openai")
171
+
172
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
173
+ try:
174
+ import anthropic
175
+ api_key = os.environ.get("ANTHROPIC_API_KEY")
176
+ if api_key:
177
+ self.client = anthropic.Anthropic(api_key=api_key)
178
+ print("✅ Anthropic client initialized")
179
+ else:
180
+ print("⚠️ ANTHROPIC_API_KEY not found, Anthropic disabled")
181
+ except ImportError:
182
+ print("⚠️ anthropic package not installed, install with: pip install anthropic")
183
+
184
+ elif self.provider == LLM_PROVIDER_OLLAMA:
185
+ self.ollama_base_url = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
186
+ self.ollama_model = os.environ.get("OLLAMA_MODEL", "qwen2.5:7b")
187
+ print(f"✅ Ollama configured (base_url: {self.ollama_base_url}, model: {self.ollama_model})")
188
+
189
+ elif self.provider == LLM_PROVIDER_HUGGINGFACE:
190
+ self.hf_api_key = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
191
+ self.hf_model = os.environ.get("HF_MODEL", "Qwen/Qwen2.5-7B-Instruct")
192
+ if self.hf_api_key:
193
+ print(f"✅ Hugging Face API configured (model: {self.hf_model})")
194
+ else:
195
+ print("⚠️ HF_TOKEN not found, Hugging Face may have rate limits")
196
+
197
+ elif self.provider == LLM_PROVIDER_API:
198
+ # API mode - call HF Spaces API
199
+ self.api_base_url = os.environ.get(
200
+ "HF_API_BASE_URL",
201
+ "https://davidtran999-hue-portal-backend.hf.space/api"
202
+ )
203
+ print(f"✅ API mode configured (base_url: {self.api_base_url})")
204
+
205
+ elif self.provider == LLM_PROVIDER_LLAMA_CPP:
206
+ self._initialize_llama_cpp_model()
207
+
208
+ elif self.provider == LLM_PROVIDER_LOCAL:
209
+ self._initialize_local_model()
210
+
211
+ else:
212
+ print("ℹ️ No LLM provider configured, using template-based generation")
213
+
214
+ def _initialize_local_model(self):
215
+ """Initialize local Hugging Face Transformers model."""
216
+ try:
217
+ from transformers import AutoModelForCausalLM, AutoTokenizer
218
+ import torch
219
+
220
+ # Default to Qwen 2.5 7B with 8-bit quantization (fits in GPU RAM)
221
+ model_path = os.environ.get("LOCAL_MODEL_PATH", "Qwen/Qwen2.5-7B-Instruct")
222
+ device = os.environ.get("LOCAL_MODEL_DEVICE", "auto") # auto, cpu, cuda
223
+
224
+ print(f"[LLM] Loading local model: {model_path}", flush=True)
225
+ logger.info(f"[LLM] Loading local model: {model_path}")
226
+
227
+ # Determine device
228
+ if device == "auto":
229
+ device = "cuda" if torch.cuda.is_available() else "cpu"
230
+
231
+ # Start cache monitoring for download progress (optional)
232
+ try:
233
+ from .cache_monitor import get_cache_monitor
234
+ monitor = get_cache_monitor()
235
+ monitor.start_monitoring(model_path, interval=2.0)
236
+ print(f"[LLM] 📊 Started cache monitoring for {model_path}", flush=True)
237
+ logger.info(f"[LLM] 📊 Started cache monitoring for {model_path}")
238
+ except Exception as e:
239
+ logger.warning(f"Could not start cache monitoring: {e}")
240
+
241
+ # Load tokenizer
242
+ print("[LLM] Loading tokenizer...", flush=True)
243
+ logger.info("[LLM] Loading tokenizer...")
244
+ try:
245
+ self.local_tokenizer = AutoTokenizer.from_pretrained(
246
+ model_path,
247
+ trust_remote_code=True
248
+ )
249
+ print("[LLM] ✅ Tokenizer loaded successfully", flush=True)
250
+ logger.info("[LLM] ✅ Tokenizer loaded successfully")
251
+ except Exception as tokenizer_err:
252
+ error_trace = traceback.format_exc()
253
+ print(f"[LLM] ❌ Tokenizer load error: {tokenizer_err}", flush=True)
254
+ print(f"[LLM] ❌ Tokenizer trace: {error_trace}", flush=True)
255
+ logger.error(f"[LLM] ❌ Tokenizer load error: {tokenizer_err}\n{error_trace}")
256
+ print(f"[LLM] ❌ ERROR: {type(tokenizer_err).__name__}: {str(tokenizer_err)}", file=sys.stderr, flush=True)
257
+ traceback.print_exc(file=sys.stderr)
258
+ raise
259
+
260
+ # Load model with optional quantization and fallback mechanism
261
+ print(f"[LLM] Loading model to {device}...", flush=True)
262
+ logger.info(f"[LLM] Loading model to {device}...")
263
+
264
+ # Check for quantization config
265
+ # Default to 8-bit for 7B (better thinking), 4-bit for larger models
266
+ default_8bit = "7b" in model_path.lower() or "7B" in model_path
267
+ default_4bit = ("32b" in model_path.lower() or "32B" in model_path or "14b" in model_path.lower() or "14B" in model_path) and not default_8bit
268
+
269
+ # Check environment variable for explicit quantization preference
270
+ quantization_pref = os.environ.get("LOCAL_MODEL_QUANTIZATION", "").lower()
271
+ if quantization_pref == "4bit":
272
+ use_8bit = False
273
+ use_4bit = True
274
+ elif quantization_pref == "8bit":
275
+ use_8bit = True
276
+ use_4bit = False
277
+ elif quantization_pref == "none":
278
+ use_8bit = False
279
+ use_4bit = False
280
+ else:
281
+ # Use defaults based on model size
282
+ use_8bit = os.environ.get("LOCAL_MODEL_8BIT", "true" if default_8bit else "false").lower() == "true"
283
+ use_4bit = os.environ.get("LOCAL_MODEL_4BIT", "true" if default_4bit else "false").lower() == "true"
284
+
285
+ # Try loading with fallback: 8-bit → 4-bit → float16
286
+ model_loaded = False
287
+ quantization_attempts = []
288
+
289
+ if device == "cuda":
290
+ # Attempt 1: Try 8-bit quantization (if requested)
291
+ if use_8bit:
292
+ quantization_attempts.append(("8-bit", True, False))
293
+
294
+ # Attempt 2: Try 4-bit quantization (if 8-bit fails or not requested)
295
+ if use_4bit or (use_8bit and not model_loaded):
296
+ quantization_attempts.append(("4-bit", False, True))
297
+
298
+ # Attempt 3: Fallback to float16 (no quantization)
299
+ quantization_attempts.append(("float16", False, False))
300
+ else:
301
+ # CPU: only float32
302
+ quantization_attempts.append(("float32", False, False))
303
+
304
+ last_error = None
305
+ for attempt_name, try_8bit, try_4bit in quantization_attempts:
306
+ if model_loaded:
307
+ break
308
+
309
+ try:
310
+ load_kwargs = {
311
+ "trust_remote_code": True,
312
+ "low_cpu_mem_usage": True,
313
+ }
314
+
315
+ if device == "cuda":
316
+ load_kwargs["device_map"] = "auto"
317
+
318
+ if try_4bit:
319
+ # Check if bitsandbytes is available
320
+ try:
321
+ import bitsandbytes as bnb
322
+ from transformers import BitsAndBytesConfig
323
+ load_kwargs["quantization_config"] = BitsAndBytesConfig(
324
+ load_in_4bit=True,
325
+ bnb_4bit_compute_dtype=torch.float16
326
+ )
327
+ print(f"[LLM] Attempting to load with 4-bit quantization (~4-5GB VRAM for 7B)", flush=True)
328
+ except ImportError:
329
+ print(f"[LLM] ⚠️ bitsandbytes not available, skipping 4-bit quantization", flush=True)
330
+ raise ImportError("bitsandbytes not available")
331
+ elif try_8bit:
332
+ from transformers import BitsAndBytesConfig
333
+ # Fixed: Remove CPU offload to avoid Int8Params compatibility issue
334
+ load_kwargs["quantization_config"] = BitsAndBytesConfig(
335
+ load_in_8bit=True,
336
+ llm_int8_threshold=6.0
337
+ # Removed: llm_int8_enable_fp32_cpu_offload=True (causes compatibility issues)
338
+ )
339
+ # Removed: max_memory override - let accelerate handle it automatically
340
+ print(f"[LLM] Attempting to load with 8-bit quantization (~7GB VRAM for 7B)", flush=True)
341
+ else:
342
+ load_kwargs["torch_dtype"] = torch.float16
343
+ print(f"[LLM] Attempting to load with float16 (no quantization)", flush=True)
344
+ else:
345
+ load_kwargs["torch_dtype"] = torch.float32
346
+ print(f"[LLM] Attempting to load with float32 (CPU)", flush=True)
347
+
348
+ # Load model
349
+ self.local_model = AutoModelForCausalLM.from_pretrained(
350
+ model_path,
351
+ **load_kwargs
352
+ )
353
+
354
+ # Stop cache monitoring (download complete)
355
+ try:
356
+ from .cache_monitor import get_cache_monitor
357
+ monitor = get_cache_monitor()
358
+ monitor.stop_monitoring(model_path)
359
+ print(f"[LLM] ✅ Model download complete, stopped monitoring", flush=True)
360
+ except:
361
+ pass
362
+
363
+ print(f"[LLM] ✅ Model loaded successfully with {attempt_name} quantization", flush=True)
364
+ logger.info(f"[LLM] ✅ Model loaded successfully with {attempt_name} quantization")
365
+
366
+ # Optional: Compile model for faster inference (PyTorch 2.0+)
367
+ try:
368
+ if hasattr(torch, "compile") and device == "cuda":
369
+ print(f"[LLM] ⚡ Compiling model for faster inference...", flush=True)
370
+ self.local_model = torch.compile(self.local_model, mode="reduce-overhead")
371
+ print(f"[LLM] ✅ Model compiled successfully", flush=True)
372
+ logger.info(f"[LLM] ✅ Model compiled for faster inference")
373
+ except Exception as compile_err:
374
+ print(f"[LLM] ⚠️ Model compilation skipped: {compile_err}", flush=True)
375
+ # Continue without compilation
376
+
377
+ model_loaded = True
378
+
379
+ except Exception as model_load_err:
380
+ last_error = model_load_err
381
+ error_trace = traceback.format_exc()
382
+ print(f"[LLM] ⚠️ Failed to load with {attempt_name}: {model_load_err}", flush=True)
383
+ logger.warning(f"[LLM] ⚠️ Failed to load with {attempt_name}: {model_load_err}")
384
+
385
+ # If this was the last attempt, raise the error
386
+ if attempt_name == quantization_attempts[-1][0]:
387
+ print(f"[LLM] ❌ All quantization attempts failed. Last error: {model_load_err}", flush=True)
388
+ print(f"[LLM] ❌ Model load trace: {error_trace}", flush=True)
389
+ logger.error(f"[LLM] ❌ Model load error: {model_load_err}\n{error_trace}")
390
+ print(f"[LLM] ❌ ERROR: {type(model_load_err).__name__}: {str(model_load_err)}", file=sys.stderr, flush=True)
391
+ traceback.print_exc(file=sys.stderr)
392
+ raise
393
+ else:
394
+ # Try next quantization method
395
+ print(f"[LLM] 🔄 Falling back to next quantization method...", flush=True)
396
+ continue
397
+
398
+ if not model_loaded:
399
+ raise RuntimeError("Failed to load model with any quantization method")
400
+
401
+ if device == "cpu":
402
+ try:
403
+ self.local_model = self.local_model.to(device)
404
+ print(f"[LLM] ✅ Model moved to {device}", flush=True)
405
+ logger.info(f"[LLM] ✅ Model moved to {device}")
406
+ except Exception as move_err:
407
+ error_trace = traceback.format_exc()
408
+ print(f"[LLM] ❌ Model move error: {move_err}", flush=True)
409
+ logger.error(f"[LLM] ❌ Model move error: {move_err}\n{error_trace}")
410
+ print(f"[LLM] ❌ ERROR: {type(move_err).__name__}: {str(move_err)}", file=sys.stderr, flush=True)
411
+ traceback.print_exc(file=sys.stderr)
412
+
413
+ self.local_model.eval() # Set to evaluation mode
414
+ print(f"[LLM] ✅ Local model loaded successfully on {device}", flush=True)
415
+ logger.info(f"[LLM] ✅ Local model loaded successfully on {device}")
416
+
417
+ except ImportError as import_err:
418
+ error_msg = "transformers package not installed, install with: pip install transformers torch"
419
+ print(f"[LLM] ⚠️ {error_msg}", flush=True)
420
+ logger.warning(f"[LLM] ⚠️ {error_msg}")
421
+ print(f"[LLM] ❌ ImportError: {import_err}", file=sys.stderr, flush=True)
422
+ self.local_model = None
423
+ self.local_tokenizer = None
424
+ except Exception as e:
425
+ error_trace = traceback.format_exc()
426
+ print(f"[LLM] ❌ Error loading local model: {e}", flush=True)
427
+ print(f"[LLM] ❌ Full trace: {error_trace}", flush=True)
428
+ logger.error(f"[LLM] ❌ Error loading local model: {e}\n{error_trace}")
429
+ print(f"[LLM] ❌ ERROR: {type(e).__name__}: {str(e)}", file=sys.stderr, flush=True)
430
+ traceback.print_exc(file=sys.stderr)
431
+ print("[LLM] 💡 Tip: Use smaller models like Qwen/Qwen2.5-1.5B-Instruct or Qwen/Qwen2.5-0.5B-Instruct", flush=True)
432
+ self.local_model = None
433
+ self.local_tokenizer = None
434
+
435
+ def _initialize_llama_cpp_model(self) -> None:
436
+ """Initialize llama.cpp runtime for GGUF inference."""
437
+ # Use shared model if available (singleton pattern for process-level reuse)
438
+ if LLMGenerator._llama_cpp_shared is not None:
439
+ self.llama_cpp = LLMGenerator._llama_cpp_shared
440
+ self.llama_cpp_model_path = LLMGenerator._llama_cpp_model_path_shared
441
+ print("[LLM] ♻️ Reusing shared llama.cpp model (kept alive)", flush=True)
442
+ logger.debug("[LLM] Reusing shared llama.cpp model (kept alive)")
443
+ return
444
+
445
+ # Skip if instance model already loaded
446
+ if self.llama_cpp is not None:
447
+ print("[LLM] ♻️ llama.cpp model already loaded, skipping re-initialization", flush=True)
448
+ logger.debug("[LLM] llama.cpp model already loaded, skipping re-initialization")
449
+ return
450
+
451
+ try:
452
+ from llama_cpp import Llama
453
+ except ImportError:
454
+ print("⚠️ llama-cpp-python not installed. Run: pip install llama-cpp-python", flush=True)
455
+ logger.warning("llama-cpp-python not installed")
456
+ return
457
+
458
+ model_path = os.environ.get(
459
+ "LLAMA_CPP_MODEL_PATH",
460
+ # Default to the local GGUF file under backend/models
461
+ str(BASE_DIR / "models" / "gemma-2b-it-Q5_K_M.gguf"),
462
+ )
463
+ resolved_path = self._resolve_llama_cpp_model_path(model_path)
464
+ if not resolved_path:
465
+ print("❌ Unable to resolve GGUF model path for llama.cpp", flush=True)
466
+ logger.error("Unable to resolve GGUF model path for llama.cpp")
467
+ return
468
+
469
+ # CPU-friendly defaults: smaller context/batch to reduce latency/RAM
470
+ n_ctx = int(os.environ.get("LLAMA_CPP_CONTEXT", "8192"))
471
+ n_threads = int(os.environ.get("LLAMA_CPP_THREADS", "4"))
472
+ n_batch = int(os.environ.get("LLAMA_CPP_BATCH", "1024"))
473
+ n_gpu_layers = int(os.environ.get("LLAMA_CPP_GPU_LAYERS", "0"))
474
+ use_mmap = os.environ.get("LLAMA_CPP_USE_MMAP", "true").lower() == "true"
475
+ use_mlock = os.environ.get("LLAMA_CPP_USE_MLOCK", "true").lower() == "true"
476
+ rope_freq_base = os.environ.get("LLAMA_CPP_ROPE_FREQ_BASE")
477
+ rope_freq_scale = os.environ.get("LLAMA_CPP_ROPE_FREQ_SCALE")
478
+
479
+ llama_kwargs = {
480
+ "model_path": resolved_path,
481
+ "n_ctx": n_ctx,
482
+ "n_batch": n_batch,
483
+ "n_threads": n_threads,
484
+ "n_gpu_layers": n_gpu_layers,
485
+ "use_mmap": use_mmap,
486
+ "use_mlock": use_mlock,
487
+ "logits_all": False,
488
+ }
489
+ if rope_freq_base and rope_freq_scale:
490
+ try:
491
+ llama_kwargs["rope_freq_base"] = float(rope_freq_base)
492
+ llama_kwargs["rope_freq_scale"] = float(rope_freq_scale)
493
+ except ValueError:
494
+ logger.warning("Invalid rope frequency overrides, ignoring custom values.")
495
+
496
+ try:
497
+ print(f"[LLM] Loading llama.cpp model: {resolved_path}", flush=True)
498
+ logger.info("[LLM] Loading llama.cpp model from %s", resolved_path)
499
+ self.llama_cpp = Llama(**llama_kwargs)
500
+ self.llama_cpp_model_path = resolved_path
501
+ # Store in shared cache for reuse across instances
502
+ LLMGenerator._llama_cpp_shared = self.llama_cpp
503
+ LLMGenerator._llama_cpp_model_path_shared = resolved_path
504
+ print(
505
+ f"[LLM] ✅ llama.cpp ready (ctx={n_ctx}, threads={n_threads}, batch={n_batch}) - Model cached for reuse",
506
+ flush=True,
507
+ )
508
+ logger.info(
509
+ "[LLM] ✅ llama.cpp ready (ctx=%s, threads=%s, batch=%s)",
510
+ n_ctx,
511
+ n_threads,
512
+ n_batch,
513
+ )
514
+ except Exception as exc:
515
+ error_trace = traceback.format_exc()
516
+ print(f"[LLM] ❌ Failed to load llama.cpp model: {exc}", flush=True)
517
+ print(f"[LLM] ❌ Trace: {error_trace}", flush=True)
518
+ logger.error("Failed to load llama.cpp model: %s\n%s", exc, error_trace)
519
+ self.llama_cpp = None
520
+
521
+ def _resolve_llama_cpp_model_path(self, configured_path: str) -> Optional[str]:
522
+ """Resolve GGUF model path, downloading from Hugging Face if needed."""
523
+ potential_path = Path(configured_path)
524
+ if potential_path.is_file():
525
+ logger.info(f"[LLM] Using existing model file: {potential_path}")
526
+ return str(potential_path)
527
+
528
+ repo_id = os.environ.get(
529
+ "LLAMA_CPP_MODEL_REPO",
530
+ "QuantFactory/gemma-2-2b-it-GGUF",
531
+ )
532
+ filename = os.environ.get(
533
+ "LLAMA_CPP_MODEL_FILE",
534
+ "gemma-2-2b-it-Q5_K_M.gguf",
535
+ )
536
+ cache_dir = Path(os.environ.get("LLAMA_CPP_CACHE_DIR", BASE_DIR / "models"))
537
+ cache_dir.mkdir(parents=True, exist_ok=True)
538
+
539
+ # Check if file already exists in cache_dir (avoid re-downloading)
540
+ cached_file = cache_dir / filename
541
+ if cached_file.is_file():
542
+ logger.info(f"[LLM] Using cached model file: {cached_file}")
543
+ print(f"[LLM] ✅ Found cached model: {cached_file}", flush=True)
544
+ return str(cached_file)
545
+
546
+ try:
547
+ from huggingface_hub import hf_hub_download
548
+ except ImportError:
549
+ print("⚠️ huggingface_hub not installed. Run: pip install huggingface_hub", flush=True)
550
+ logger.warning("huggingface_hub not installed")
551
+ return None
552
+
553
+ try:
554
+ print(f"[LLM] Downloading model from Hugging Face: {repo_id}/{filename}", flush=True)
555
+ logger.info(f"[LLM] Downloading model from Hugging Face: {repo_id}/{filename}")
556
+ # hf_hub_download has built-in caching - won't re-download if file exists in HF cache
557
+ downloaded_path = hf_hub_download(
558
+ repo_id=repo_id,
559
+ filename=filename,
560
+ local_dir=str(cache_dir),
561
+ local_dir_use_symlinks=False,
562
+ # Force download only if file doesn't exist (hf_hub_download checks cache automatically)
563
+ )
564
+ print(f"[LLM] ✅ Model downloaded/cached: {downloaded_path}", flush=True)
565
+ logger.info(f"[LLM] ✅ Model downloaded/cached: {downloaded_path}")
566
+ return downloaded_path
567
+ except Exception as exc:
568
+ error_trace = traceback.format_exc()
569
+ print(f"[LLM] ❌ Failed to download GGUF model: {exc}", flush=True)
570
+ print(f"[LLM] ❌ Trace: {error_trace}", flush=True)
571
+ logger.error("Failed to download GGUF model: %s\n%s", exc, error_trace)
572
+ return None
573
+
574
+ def is_available(self) -> bool:
575
+ """Check if LLM is available."""
576
+ return (
577
+ self.client is not None
578
+ or self.provider == LLM_PROVIDER_OLLAMA
579
+ or self.provider == LLM_PROVIDER_HUGGINGFACE
580
+ or self.provider == LLM_PROVIDER_API
581
+ or (self.provider == LLM_PROVIDER_LOCAL and self.local_model is not None)
582
+ or (self.provider == LLM_PROVIDER_LLAMA_CPP and self.llama_cpp is not None)
583
+ )
584
+
585
+ def generate_answer(
586
+ self,
587
+ query: str,
588
+ context: Optional[List[Dict[str, Any]]] = None,
589
+ documents: Optional[List[Any]] = None
590
+ ) -> Optional[str]:
591
+ """
592
+ Generate natural language answer from documents.
593
+
594
+ Args:
595
+ query: User query.
596
+ context: Optional conversation context.
597
+ documents: Retrieved documents.
598
+
599
+ Returns:
600
+ Generated answer or None if LLM not available.
601
+ """
602
+ if not self.is_available():
603
+ return None
604
+
605
+ prompt = self._build_prompt(query, context, documents)
606
+ return self._generate_from_prompt(prompt, context=context)
607
+
608
+ def _build_prompt(
609
+ self,
610
+ query: str,
611
+ context: Optional[List[Dict[str, Any]]],
612
+ documents: Optional[List[Any]]
613
+ ) -> str:
614
+ """Build prompt for LLM."""
615
+ prompt_parts = [
616
+ "Bạn là chuyên gia tư vấn về xử lí kỷ luật cán bộ đảng viên của Phòng Thanh Tra - Công An Thành Phố Huế.",
617
+ "Nhiệm vụ: Trả lời câu hỏi của người dùng dựa trên các văn bản quy định pháp luật về xử lí kỷ luật cán bộ đảng viên được cung cấp.",
618
+ "",
619
+ f"Câu hỏi của người dùng: {query}",
620
+ ""
621
+ ]
622
+
623
+ if context:
624
+ prompt_parts.append("Ngữ cảnh cuộc hội thoại trước đó:")
625
+ for msg in context[-3:]: # Last 3 messages
626
+ role = "Người dùng" if msg.get("role") == "user" else "Bot"
627
+ content = msg.get("content", "")
628
+ prompt_parts.append(f"{role}: {content}")
629
+ prompt_parts.append("")
630
+
631
+ if documents:
632
+ prompt_parts.append("Các văn bản/quy định liên quan:")
633
+ # 4 chunks for good context and speed balance
634
+ for i, doc in enumerate(documents[:4], 1):
635
+ # Extract relevant fields based on document type
636
+ doc_text = self._format_document(doc)
637
+ prompt_parts.append(f"{i}. {doc_text}")
638
+ prompt_parts.append("")
639
+ # If documents exist, require strict adherence
640
+ prompt_parts.extend([
641
+ "Yêu cầu QUAN TRỌNG:",
642
+ "- CHỈ trả lời dựa trên thông tin trong 'Các văn bản/quy định liên quan' ở trên",
643
+ "- KHÔNG được tự tạo hoặc suy đoán thông tin không có trong tài liệu",
644
+ "- Khi đã có trích đoạn, phải tổng hợp theo cấu trúc rõ ràng:\n 1) Tóm tắt ngắn gọn nội dung chính\n 2) Liệt kê từng điều/khoản hoặc hình thức xử lý (dùng bullet/đánh số, ghi rõ Điều, Khoản, trang, tên văn bản)\n 3) Kết luận + khuyến nghị áp dụng.",
645
+ "- Luôn nhắc tên văn bản (ví dụ: Quyết định 69/QĐ-TW) và mã điều trong nội dung trả lời.",
646
+ "- Kết thúc phần trả lời bằng câu: '(Xem trích dẫn chi tiết bên dưới)'.",
647
+ "- Không dùng những câu chung chung như 'Rất tiếc' hay 'Tôi không thể giúp', hãy trả lời thẳng vào câu hỏi.",
648
+ "- Chỉ khi HOÀN TOÀN không có thông tin trong tài liệu mới được nói: 'Thông tin trong cơ sở dữ liệu chưa đủ để trả lời câu hỏi này'",
649
+ "- Nếu có mức phạt, phải ghi rõ số tiền (ví dụ: 200.000 - 400.000 VNĐ)",
650
+ "- Nếu có điều khoản, ghi rõ mã điều (ví dụ: Điều 5, Điều 10)",
651
+ "- Nếu có thủ tục, ghi rõ hồ sơ, lệ phí, thời hạn",
652
+ "- Trả lời bằng tiếng Việt, ngắn gọn, dễ hiểu",
653
+ "",
654
+ "Trả lời:"
655
+ ])
656
+ else:
657
+ # No documents - allow general conversation
658
+ prompt_parts.extend([
659
+ "Yêu cầu:",
660
+ "- Trả lời câu hỏi một cách tự nhiên và hữu ích như một chatbot AI thông thường.",
661
+ "- Phản hồi phải có ít nhất 2 đoạn (mỗi đoạn ≥ 2 câu) và tổng cộng ≥ 6 câu.",
662
+ "- Luôn có ít nhất 1 danh sách bullet hoặc đánh số để người dùng dễ làm theo.",
663
+ "- Với chủ đề đời sống (ẩm thực, sức khỏe, du lịch, công nghệ...), hãy đưa ra gợi ý thật đầy đủ, gồm tối thiểu 4-6 câu hoặc 2 đoạn nội dung.",
664
+ "- Nếu câu hỏi cần công thức/nấu ăn: liệt kê NGUYÊN LIỆU rõ ràng (dạng bullet) và CÁC BƯỚC chi tiết (đánh số 1,2,3...). Đề xuất thêm mẹo hoặc biến tấu phù hợp.",
665
+ "- Với các chủ đề mẹo vặt khác, hãy chia nhỏ câu trả lời thành từng phần (Ví dụ: Bối cảnh → Các bước → Lưu ý).",
666
+ "- Tuyệt đối không mở đầu bằng lời xin lỗi hoặc từ chối; hãy đi thẳng vào nội dung chính.",
667
+ "- Nếu câu hỏi liên quan đến pháp luật, thủ tục, mức phạt nhưng không có thông tin trong cơ sở dữ liệu, hãy nói: 'Tôi không tìm thấy thông tin này trong cơ sở dữ liệu. Bạn có thể liên hệ trực tiếp với Công an thành phố Huế để được tư vấn chi tiết hơn.'",
668
+ "- Giữ giọng điệu thân thiện, khích lệ, giống một người bạn hiểu biết.",
669
+ "- Trả lời bằng tiếng Việt, mạch lạc, dễ hiểu, ưu tiên trình bày có tiêu đề/phân đoạn để người đọc dễ làm theo.",
670
+ "",
671
+ "Trả lời:"
672
+ ])
673
+
674
+ return "\n".join(prompt_parts)
675
+
676
+ def _generate_from_prompt(
677
+ self,
678
+ prompt: str,
679
+ context: Optional[List[Dict[str, Any]]] = None,
680
+ llm_mode: Optional[str] = None,
681
+ ) -> Optional[str]:
682
+ """Run current provider with a fully formatted prompt."""
683
+ mode = (llm_mode or self.llm_mode or "answer").strip().lower()
684
+ if mode not in {"keywords", "answer"}:
685
+ mode = "answer"
686
+ if not self.is_available():
687
+ return None
688
+
689
+ try:
690
+ print(f"[LLM] Generating answer with provider: {self.provider}", flush=True)
691
+ logger.info(f"[LLM] Generating answer with provider: {self.provider}")
692
+
693
+ if self.provider == LLM_PROVIDER_OPENAI:
694
+ result = self._generate_openai(prompt)
695
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
696
+ result = self._generate_anthropic(prompt)
697
+ elif self.provider == LLM_PROVIDER_OLLAMA:
698
+ result = self._generate_ollama(prompt)
699
+ elif self.provider == LLM_PROVIDER_HUGGINGFACE:
700
+ result = self._generate_huggingface(prompt, mode)
701
+ elif self.provider == LLM_PROVIDER_LOCAL:
702
+ result = self._generate_local(prompt, mode)
703
+ elif self.provider == LLM_PROVIDER_LLAMA_CPP:
704
+ result = self._generate_llama_cpp(prompt, mode)
705
+ elif self.provider == LLM_PROVIDER_API:
706
+ result = self._generate_api(prompt, context)
707
+ else:
708
+ result = None
709
+
710
+ if result:
711
+ print(
712
+ f"[LLM] ✅ Answer generated successfully (length: {len(result)})",
713
+ flush=True,
714
+ )
715
+ logger.info(
716
+ f"[LLM] ✅ Answer generated successfully (length: {len(result)})"
717
+ )
718
+ else:
719
+ print(f"[LLM] ⚠️ No answer generated", flush=True)
720
+ logger.warning("[LLM] ⚠️ No answer generated")
721
+
722
+ return result
723
+ except Exception as exc:
724
+ error_trace = traceback.format_exc()
725
+ print(f"[LLM] ❌ Error generating answer: {exc}", flush=True)
726
+ print(f"[LLM] ❌ Full trace: {error_trace}", flush=True)
727
+ logger.error(f"[LLM] ❌ Error generating answer: {exc}\n{error_trace}")
728
+ print(
729
+ f"[LLM] ❌ ERROR: {type(exc).__name__}: {str(exc)}",
730
+ file=sys.stderr,
731
+ flush=True,
732
+ )
733
+ traceback.print_exc(file=sys.stderr)
734
+ return None
735
+
736
+ def suggest_clarification_topics(
737
+ self,
738
+ query: str,
739
+ candidates: List[Dict[str, Any]],
740
+ max_options: int = 3,
741
+ ) -> Optional[Dict[str, Any]]:
742
+ """
743
+ Ask the LLM to propose clarification options based on candidate documents.
744
+ """
745
+ if not candidates or not self.is_available():
746
+ return None
747
+
748
+ candidate_lines = []
749
+ for idx, candidate in enumerate(candidates[: max_options + 2], 1):
750
+ title = candidate.get("title") or candidate.get("code") or "Văn bản"
751
+ summary = candidate.get("summary") or candidate.get("section_title") or ""
752
+ doc_type = candidate.get("doc_type") or ""
753
+ candidate_lines.append(
754
+ f"{idx}. {candidate.get('code', '').upper()} – {title}\n"
755
+ f" Loại: {doc_type or 'không rõ'}; Tóm tắt: {summary[:200] or 'Không có'}"
756
+ )
757
+
758
+ prompt = (
759
+ "Bạn là trợ lý pháp luật. Người dùng vừa hỏi:\n"
760
+ f"\"{query.strip()}\"\n\n"
761
+ "Đây là các văn bản ứng viên có thể liên quan:\n"
762
+ f"{os.linesep.join(candidate_lines)}\n\n"
763
+ "Hãy chọn tối đa {max_options} văn bản quan trọng cần người dùng xác nhận để tôi tra cứu chính xác.\n"
764
+ "Yêu cầu trả về JSON với dạng:\n"
765
+ "{\n"
766
+ ' "message": "Câu nhắc người dùng bằng tiếng Việt",\n'
767
+ ' "options": [\n'
768
+ ' {"code": "MÃ VĂN BẢN", "title": "Tên văn bản", "reason": "Lý do gợi ý"},\n'
769
+ " ...\n"
770
+ " ]\n"
771
+ "}\n"
772
+ "Chỉ in JSON, không thêm lời giải thích khác."
773
+ ).format(max_options=max_options)
774
+
775
+ raw = self._generate_from_prompt(prompt, llm_mode="keywords")
776
+ if not raw:
777
+ return None
778
+
779
+ parsed = self._extract_json_payload(raw)
780
+ if not parsed:
781
+ return None
782
+
783
+ options = parsed.get("options") or []
784
+ sanitized_options = []
785
+ for option in options:
786
+ code = (option.get("code") or "").strip()
787
+ title = (option.get("title") or "").strip()
788
+ if not code or not title:
789
+ continue
790
+ sanitized_options.append(
791
+ {
792
+ "code": code.upper(),
793
+ "title": title,
794
+ "reason": (option.get("reason") or "").strip(),
795
+ }
796
+ )
797
+ if len(sanitized_options) >= max_options:
798
+ break
799
+
800
+ if not sanitized_options:
801
+ return None
802
+
803
+ message = (parsed.get("message") or "Tôi cần bạn chọn văn bản muốn tra cứu chi tiết hơn.").strip()
804
+ return {"message": message, "options": sanitized_options}
805
+
806
+ def suggest_topic_options(
807
+ self,
808
+ query: str,
809
+ document_code: str,
810
+ document_title: str,
811
+ search_results: List[Dict[str, Any]],
812
+ conversation_context: Optional[List[Dict[str, str]]] = None,
813
+ max_options: int = 3,
814
+ ) -> Optional[Dict[str, Any]]:
815
+ """
816
+ Ask the LLM to propose topic/section options within a selected document.
817
+
818
+ Args:
819
+ query: Original user query
820
+ document_code: Selected document code
821
+ document_title: Selected document title
822
+ search_results: Pre-searched sections from the document
823
+ conversation_context: Recent conversation history
824
+ max_options: Maximum number of options to return
825
+
826
+ Returns:
827
+ Dict with message, options, and search_keywords
828
+ """
829
+ if not self.is_available():
830
+ return None
831
+
832
+ # Build context summary
833
+ context_summary = ""
834
+ if conversation_context:
835
+ recent_messages = conversation_context[-3:] # Last 3 messages
836
+ context_summary = "\n".join([
837
+ f"{msg.get('role', 'user')}: {msg.get('content', '')[:100]}"
838
+ for msg in recent_messages
839
+ ])
840
+
841
+ # Format search results as candidates
842
+ candidate_lines = []
843
+ for idx, result in enumerate(search_results[:max_options + 2], 1):
844
+ section_title = result.get("section_title") or result.get("title") or ""
845
+ article = result.get("article") or result.get("article_number") or ""
846
+ excerpt = result.get("excerpt") or result.get("body") or ""
847
+ if excerpt:
848
+ excerpt = excerpt[:150] + "..." if len(excerpt) > 150 else excerpt
849
+
850
+ candidate_lines.append(
851
+ f"{idx}. {section_title or article or 'Điều khoản'}\n"
852
+ f" {'Điều: ' + article if article else ''}\n"
853
+ f" Nội dung: {excerpt[:200] or 'Không có'}"
854
+ )
855
+
856
+ prompt = (
857
+ "Bạn là trợ lý pháp luật. Người dùng đã chọn văn bản:\n"
858
+ f"- Mã: {document_code}\n"
859
+ f"- Tên: {document_title}\n\n"
860
+ f"Câu hỏi ban đầu của người dùng: \"{query.strip()}\"\n\n"
861
+ )
862
+
863
+ if context_summary:
864
+ prompt += (
865
+ f"Lịch sử hội thoại gần đây:\n{context_summary}\n\n"
866
+ )
867
+
868
+ prompt += (
869
+ "Đây là các điều khoản/chủ đề trong văn bản có thể liên quan:\n"
870
+ f"{os.linesep.join(candidate_lines)}\n\n"
871
+ f"Hãy chọn tối đa {max_options} chủ đề/điều khoản quan trọng nhất cần người dùng xác nhận.\n"
872
+ "Yêu cầu trả về JSON với dạng:\n"
873
+ "{\n"
874
+ ' "message": "Câu nhắc người dùng bằng tiếng Việt",\n'
875
+ ' "options": [\n'
876
+ ' {"title": "Tên chủ đề/điều khoản", "article": "Điều X", "reason": "Lý do gợi ý", "keywords": ["từ", "khóa", "tìm", "kiếm"]},\n'
877
+ " ...\n"
878
+ " ],\n"
879
+ ' "search_keywords": ["từ", "khóa", "chính", "để", "tìm", "kiếm"]\n'
880
+ "}\n"
881
+ "Trong đó:\n"
882
+ "- options: Danh sách chủ đề/điều khoản để người dùng chọn\n"
883
+ "- search_keywords: Danh sách từ khóa quan trọng để tìm kiếm thông tin liên quan\n"
884
+ "- Mỗi option nên có keywords riêng để tìm kiếm chính xác hơn\n"
885
+ "Chỉ in JSON, không thêm lời giải thích khác."
886
+ )
887
+
888
+ raw = self._generate_from_prompt(prompt, llm_mode="keywords")
889
+ if not raw:
890
+ return None
891
+
892
+ parsed = self._extract_json_payload(raw)
893
+ if not parsed:
894
+ return None
895
+
896
+ options = parsed.get("options") or []
897
+ sanitized_options = []
898
+ for option in options:
899
+ title = (option.get("title") or "").strip()
900
+ if not title:
901
+ continue
902
+
903
+ sanitized_options.append({
904
+ "title": title,
905
+ "article": (option.get("article") or "").strip(),
906
+ "reason": (option.get("reason") or "").strip(),
907
+ "keywords": option.get("keywords") or [],
908
+ })
909
+ if len(sanitized_options) >= max_options:
910
+ break
911
+
912
+ if not sanitized_options:
913
+ return None
914
+
915
+ message = (parsed.get("message") or f"Bạn muốn tìm điều khoản/chủ đề nào cụ thể trong {document_title}?").strip()
916
+ search_keywords = parsed.get("search_keywords") or []
917
+
918
+ return {
919
+ "message": message,
920
+ "options": sanitized_options,
921
+ "search_keywords": search_keywords,
922
+ }
923
+
924
+ def suggest_detail_options(
925
+ self,
926
+ query: str,
927
+ selected_document_code: str,
928
+ selected_topic: str,
929
+ conversation_context: Optional[List[Dict[str, str]]] = None,
930
+ max_options: int = 3,
931
+ ) -> Optional[Dict[str, Any]]:
932
+ """
933
+ Ask the LLM to propose detail options for further clarification.
934
+
935
+ Args:
936
+ query: Original user query
937
+ selected_document_code: Selected document code
938
+ selected_topic: Selected topic/section
939
+ conversation_context: Recent conversation history
940
+ max_options: Maximum number of options to return
941
+
942
+ Returns:
943
+ Dict with message, options, and search_keywords
944
+ """
945
+ if not self.is_available():
946
+ return None
947
+
948
+ # Build context summary
949
+ context_summary = ""
950
+ if conversation_context:
951
+ recent_messages = conversation_context[-5:] # Last 5 messages
952
+ context_summary = "\n".join([
953
+ f"{msg.get('role', 'user')}: {msg.get('content', '')[:100]}"
954
+ for msg in recent_messages
955
+ ])
956
+
957
+ prompt = (
958
+ "Bạn là trợ lý pháp luật. Người dùng đã:\n"
959
+ f"1. Chọn văn bản: {selected_document_code}\n"
960
+ f"2. Chọn chủ đề: {selected_topic}\n\n"
961
+ f"Câu hỏi ban đầu: \"{query.strip()}\"\n\n"
962
+ )
963
+
964
+ if context_summary:
965
+ prompt += (
966
+ f"Lịch sử hội thoại:\n{context_summary}\n\n"
967
+ )
968
+
969
+ prompt += (
970
+ "Người dùng muốn biết thêm chi tiết về chủ đề này.\n"
971
+ f"Hãy đề xuất tối đa {max_options} khía cạnh/chi tiết cụ thể mà người dùng có thể muốn biết.\n"
972
+ "Yêu cầu trả về JSON với dạng:\n"
973
+ "{\n"
974
+ ' "message": "Câu hỏi xác nhận bằng tiếng Việt",\n'
975
+ ' "options": [\n'
976
+ ' {"title": "Khía cạnh/chi tiết", "reason": "Lý do gợi ý", "keywords": ["từ", "khóa"]},\n'
977
+ " ...\n"
978
+ " ],\n"
979
+ ' "search_keywords": ["từ", "khóa", "tìm", "kiếm"]\n'
980
+ "}\n"
981
+ "Chỉ in JSON, không thêm lời giải thích khác."
982
+ )
983
+
984
+ raw = self._generate_from_prompt(prompt, llm_mode="keywords")
985
+ if not raw:
986
+ return None
987
+
988
+ parsed = self._extract_json_payload(raw)
989
+ if not parsed:
990
+ return None
991
+
992
+ options = parsed.get("options") or []
993
+ sanitized_options = []
994
+ for option in options:
995
+ title = (option.get("title") or "").strip()
996
+ if not title:
997
+ continue
998
+
999
+ sanitized_options.append({
1000
+ "title": title,
1001
+ "reason": (option.get("reason") or "").strip(),
1002
+ "keywords": option.get("keywords") or [],
1003
+ })
1004
+ if len(sanitized_options) >= max_options:
1005
+ break
1006
+
1007
+ if not sanitized_options:
1008
+ return None
1009
+
1010
+ message = (parsed.get("message") or "Bạn muốn chi tiết gì cho chủ đề này nữa không?").strip()
1011
+ search_keywords = parsed.get("search_keywords") or []
1012
+
1013
+ return {
1014
+ "message": message,
1015
+ "options": sanitized_options,
1016
+ "search_keywords": search_keywords,
1017
+ }
1018
+
1019
+ def extract_search_keywords(
1020
+ self,
1021
+ query: str,
1022
+ selected_options: Optional[List[Dict[str, Any]]] = None,
1023
+ conversation_context: Optional[List[Dict[str, str]]] = None,
1024
+ ) -> List[str]:
1025
+ """
1026
+ Intelligently extract search keywords from query, selected options, and context.
1027
+
1028
+ Args:
1029
+ query: Original user query
1030
+ selected_options: List of selected options (document, topic, etc.)
1031
+ conversation_context: Recent conversation history
1032
+
1033
+ Returns:
1034
+ List of extracted keywords for search optimization
1035
+ """
1036
+ if not self.is_available():
1037
+ # Fallback to simple keyword extraction
1038
+ return self._fallback_keyword_extraction(query)
1039
+
1040
+ # Build context
1041
+ context_text = query
1042
+ if selected_options:
1043
+ for opt in selected_options:
1044
+ title = opt.get("title") or opt.get("code") or ""
1045
+ reason = opt.get("reason") or ""
1046
+ keywords = opt.get("keywords") or []
1047
+ if title:
1048
+ context_text += f" {title}"
1049
+ if reason:
1050
+ context_text += f" {reason}"
1051
+ if keywords:
1052
+ context_text += f" {' '.join(keywords)}"
1053
+
1054
+ if conversation_context:
1055
+ recent_user_messages = [
1056
+ msg.get("content", "")
1057
+ for msg in conversation_context[-3:]
1058
+ if msg.get("role") == "user"
1059
+ ]
1060
+ context_text += " " + " ".join(recent_user_messages)
1061
+
1062
+ prompt = (
1063
+ "Bạn là trợ lý pháp luật. Tôi cần bạn trích xuất các từ khóa quan trọng để tìm kiếm thông tin.\n\n"
1064
+ f"Ngữ cảnh: {context_text[:500]}\n\n"
1065
+ "Hãy trích xuất 5-10 từ khóa quan trọng nhất (tiếng Việt) để tìm kiếm.\n"
1066
+ "Yêu cầu trả về JSON với dạng:\n"
1067
+ "{\n"
1068
+ ' "keywords": ["từ", "khóa", "quan", "trọng"]\n'
1069
+ "}\n"
1070
+ "Chỉ in JSON, không thêm lời giải thích khác."
1071
+ )
1072
+
1073
+ raw = self._generate_from_prompt(prompt, llm_mode="keywords")
1074
+ if not raw:
1075
+ return self._fallback_keyword_extraction(query)
1076
+
1077
+ parsed = self._extract_json_payload(raw)
1078
+ if not parsed:
1079
+ return self._fallback_keyword_extraction(query)
1080
+
1081
+ keywords = parsed.get("keywords") or []
1082
+ if isinstance(keywords, list) and len(keywords) > 0:
1083
+ # Filter out stopwords and short words
1084
+ filtered_keywords = [
1085
+ kw.strip().lower()
1086
+ for kw in keywords
1087
+ if kw and len(kw.strip()) > 2
1088
+ ]
1089
+ return filtered_keywords[:10] # Limit to 10 keywords
1090
+
1091
+ return self._fallback_keyword_extraction(query)
1092
+
1093
+ def _fallback_keyword_extraction(self, query: str) -> List[str]:
1094
+ """Fallback keyword extraction using simple rule-based method."""
1095
+ # Simple Vietnamese stopwords
1096
+ stopwords = {
1097
+ "và", "của", "cho", "với", "trong", "là", "có", "được", "bị", "sẽ",
1098
+ "thì", "mà", "này", "đó", "nào", "gì", "như", "về", "từ", "đến",
1099
+ "các", "những", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám",
1100
+ "chín", "mười", "nhiều", "ít", "rất", "quá", "cũng", "đã", "sẽ",
1101
+ }
1102
+
1103
+ words = query.lower().split()
1104
+ keywords = [
1105
+ w.strip()
1106
+ for w in words
1107
+ if w.strip() not in stopwords and len(w.strip()) > 2
1108
+ ]
1109
+ return keywords[:10]
1110
+
1111
+ def _extract_json_payload(self, raw: str) -> Optional[Dict[str, Any]]:
1112
+ """Best-effort extraction of JSON object from raw LLM text."""
1113
+ if not raw:
1114
+ return None
1115
+ raw = raw.strip()
1116
+ for snippet in (raw, self._slice_to_json(raw)):
1117
+ if not snippet:
1118
+ continue
1119
+ try:
1120
+ return json.loads(snippet)
1121
+ except Exception:
1122
+ continue
1123
+ return None
1124
+
1125
+ def _slice_to_json(self, text: str) -> Optional[str]:
1126
+ start = text.find("{")
1127
+ end = text.rfind("}")
1128
+ if start == -1 or end == -1 or end <= start:
1129
+ return None
1130
+ return text[start : end + 1]
1131
+
1132
+ def generate_structured_legal_answer(
1133
+ self,
1134
+ query: str,
1135
+ documents: List[Any],
1136
+ prefill_summary: Optional[str] = None,
1137
+ ) -> Optional[LegalAnswer]:
1138
+ """
1139
+ Ask the LLM for a structured legal answer (summary + details + citations).
1140
+ """
1141
+ if not self.is_available() or not documents:
1142
+ return None
1143
+
1144
+ parser = get_legal_output_parser()
1145
+ guard = get_legal_guard()
1146
+ retry_hint: Optional[str] = None
1147
+ failure_reason: Optional[str] = None
1148
+
1149
+ for attempt in range(LEGAL_STRUCTURED_MAX_ATTEMPTS):
1150
+ prompt = build_structured_legal_prompt(
1151
+ query,
1152
+ documents,
1153
+ parser,
1154
+ prefill_summary=prefill_summary,
1155
+ retry_hint=retry_hint,
1156
+ )
1157
+ logger.debug(
1158
+ "[LLM] Structured prompt preview (attempt %s): %s",
1159
+ attempt + 1,
1160
+ prompt[:600].replace("\n", " "),
1161
+ )
1162
+ raw_output = self._generate_from_prompt(prompt)
1163
+
1164
+ if not raw_output:
1165
+ failure_reason = "LLM không trả lời"
1166
+ retry_hint = (
1167
+ "Lần trước bạn không trả về JSON nào. "
1168
+ "Hãy in duy nhất một JSON với SUMMARY, DETAILS và CITATIONS."
1169
+ )
1170
+ continue
1171
+
1172
+ _write_guardrails_debug(
1173
+ f"raw_output_attempt_{attempt + 1}",
1174
+ raw_output,
1175
+ )
1176
+ structured: Optional[LegalAnswer] = None
1177
+
1178
+ try:
1179
+ guard_result = guard.parse(llm_output=raw_output)
1180
+ guarded_output = getattr(guard_result, "validated_output", None)
1181
+ if guarded_output:
1182
+ structured = LegalAnswer.parse_obj(guarded_output)
1183
+ _write_guardrails_debug(
1184
+ f"guard_validated_attempt_{attempt + 1}",
1185
+ json.dumps(guarded_output, ensure_ascii=False),
1186
+ )
1187
+ except Exception as exc:
1188
+ failure_reason = f"Guardrails: {exc}"
1189
+ logger.warning("[LLM] Guardrails validation failed: %s", exc)
1190
+ _write_guardrails_debug(
1191
+ f"guard_error_attempt_{attempt + 1}",
1192
+ f"{type(exc).__name__}: {exc}",
1193
+ )
1194
+
1195
+ if not structured:
1196
+ structured = parse_structured_output(parser, raw_output or "")
1197
+ if structured:
1198
+ _write_guardrails_debug(
1199
+ f"parser_recovery_attempt_{attempt + 1}",
1200
+ structured.model_dump_json(indent=None, ensure_ascii=False),
1201
+ )
1202
+ else:
1203
+ retry_hint = (
1204
+ "JSON chưa hợp lệ. Hãy dùng cấu trúc SUMMARY/DETAILS/CITATIONS như ví dụ."
1205
+ )
1206
+ continue
1207
+
1208
+ is_valid, validation_reason = _validate_structured_answer(structured, documents)
1209
+ if is_valid:
1210
+ return structured
1211
+
1212
+ failure_reason = validation_reason or "Không đạt yêu cầu kiểm tra nội dung"
1213
+ logger.warning(
1214
+ "[LLM] ❌ Structured answer failed validation: %s", failure_reason
1215
+ )
1216
+ retry_hint = (
1217
+ f"Lần trước vi phạm: {failure_reason}. "
1218
+ "Hãy dùng đúng tên văn bản và mã điều trong bảng tham chiếu, không bịa thông tin mới."
1219
+ )
1220
+
1221
+ logger.warning(
1222
+ "[LLM] ❌ Structured legal parsing failed sau %s lần. Lý do cuối: %s",
1223
+ LEGAL_STRUCTURED_MAX_ATTEMPTS,
1224
+ failure_reason,
1225
+ )
1226
+ return None
1227
+
1228
+ def _format_document(self, doc: Any) -> str:
1229
+ """Format document for prompt."""
1230
+ doc_type = type(doc).__name__.lower()
1231
+
1232
+ if "fine" in doc_type:
1233
+ parts = [f"Mức phạt: {getattr(doc, 'name', '')}"]
1234
+ if hasattr(doc, 'code') and doc.code:
1235
+ parts.append(f"Mã: {doc.code}")
1236
+ if hasattr(doc, 'min_fine') and hasattr(doc, 'max_fine'):
1237
+ if doc.min_fine and doc.max_fine:
1238
+ parts.append(f"Số tiền: {doc.min_fine:,.0f} - {doc.max_fine:,.0f} VNĐ")
1239
+ return " | ".join(parts)
1240
+
1241
+ elif "procedure" in doc_type:
1242
+ parts = [f"Thủ tục: {getattr(doc, 'title', '')}"]
1243
+ if hasattr(doc, 'dossier') and doc.dossier:
1244
+ parts.append(f"Hồ sơ: {doc.dossier}")
1245
+ if hasattr(doc, 'fee') and doc.fee:
1246
+ parts.append(f"Lệ phí: {doc.fee}")
1247
+ return " | ".join(parts)
1248
+
1249
+ elif "office" in doc_type:
1250
+ parts = [f"Đơn vị: {getattr(doc, 'unit_name', '')}"]
1251
+ if hasattr(doc, 'address') and doc.address:
1252
+ parts.append(f"Địa chỉ: {doc.address}")
1253
+ if hasattr(doc, 'phone') and doc.phone:
1254
+ parts.append(f"Điện thoại: {doc.phone}")
1255
+ return " | ".join(parts)
1256
+
1257
+ elif "advisory" in doc_type:
1258
+ parts = [f"Cảnh báo: {getattr(doc, 'title', '')}"]
1259
+ if hasattr(doc, 'summary') and doc.summary:
1260
+ parts.append(f"Nội dung: {doc.summary[:200]}")
1261
+ return " | ".join(parts)
1262
+
1263
+ elif "legalsection" in doc_type or "legal" in doc_type:
1264
+ parts = []
1265
+ if hasattr(doc, 'section_code') and doc.section_code:
1266
+ parts.append(f"Điều khoản: {doc.section_code}")
1267
+ if hasattr(doc, 'section_title') and doc.section_title:
1268
+ parts.append(f"Tiêu đề: {doc.section_title}")
1269
+ if hasattr(doc, 'document') and doc.document:
1270
+ doc_obj = doc.document
1271
+ if hasattr(doc_obj, 'title'):
1272
+ parts.append(f"Văn bản: {doc_obj.title}")
1273
+ if hasattr(doc_obj, 'code'):
1274
+ parts.append(f"Mã văn bản: {doc_obj.code}")
1275
+ if hasattr(doc, 'content') and doc.content:
1276
+ # Provide longer snippet so LLM has enough context (up to ~1500 chars)
1277
+ max_len = 1500
1278
+ snippet = doc.content[:max_len].strip()
1279
+ if len(doc.content) > max_len:
1280
+ snippet += "..."
1281
+ parts.append(f"Nội dung: {snippet}")
1282
+ return " | ".join(parts) if parts else str(doc)
1283
+
1284
+ return str(doc)
1285
+
1286
+ def _generate_openai(self, prompt: str) -> Optional[str]:
1287
+ """Generate answer using OpenAI."""
1288
+ if not self.client:
1289
+ return None
1290
+
1291
+ try:
1292
+ response = self.client.chat.completions.create(
1293
+ model=os.environ.get("OPENAI_MODEL", "gpt-3.5-turbo"),
1294
+ messages=[
1295
+ {"role": "system", "content": "Bạn là chuyên gia tư vấn về xử lí kỷ luật cán bộ đảng viên của Phòng Thanh Tra - Công An Thành Phố Huế. Bạn giúp người dùng tra cứu các văn bản quy định pháp luật về xử lí kỷ luật cán bộ đảng viên."},
1296
+ {"role": "user", "content": prompt}
1297
+ ],
1298
+ temperature=0.7,
1299
+ max_tokens=500
1300
+ )
1301
+ return response.choices[0].message.content
1302
+ except Exception as e:
1303
+ print(f"OpenAI API error: {e}")
1304
+ return None
1305
+
1306
+ def _generate_anthropic(self, prompt: str) -> Optional[str]:
1307
+ """Generate answer using Anthropic Claude."""
1308
+ if not self.client:
1309
+ return None
1310
+
1311
+ try:
1312
+ message = self.client.messages.create(
1313
+ model=os.environ.get("ANTHROPIC_MODEL", "claude-3-5-sonnet-20241022"),
1314
+ max_tokens=500,
1315
+ messages=[
1316
+ {"role": "user", "content": prompt}
1317
+ ]
1318
+ )
1319
+ return message.content[0].text
1320
+ except Exception as e:
1321
+ print(f"Anthropic API error: {e}")
1322
+ return None
1323
+
1324
+ def _generate_ollama(self, prompt: str) -> Optional[str]:
1325
+ """Generate answer using Ollama (local LLM)."""
1326
+ try:
1327
+ import requests
1328
+ model = getattr(self, 'ollama_model', os.environ.get("OLLAMA_MODEL", "qwen2.5:7b"))
1329
+
1330
+ response = requests.post(
1331
+ f"{self.ollama_base_url}/api/generate",
1332
+ json={
1333
+ "model": model,
1334
+ "prompt": prompt,
1335
+ "stream": False,
1336
+ "options": {
1337
+ "temperature": 0.7,
1338
+ "top_p": 0.9,
1339
+ "num_predict": 500
1340
+ }
1341
+ },
1342
+ timeout=60
1343
+ )
1344
+
1345
+ if response.status_code == 200:
1346
+ return response.json().get("response")
1347
+ return None
1348
+ except Exception as e:
1349
+ print(f"Ollama API error: {e}")
1350
+ return None
1351
+
1352
+ def _generate_huggingface(self, prompt: str, mode: str = "answer") -> Optional[str]:
1353
+ """Generate answer using Hugging Face Inference API."""
1354
+ try:
1355
+ import requests
1356
+
1357
+ api_url = f"https://api-inference.huggingface.co/models/{self.hf_model}"
1358
+ headers = {}
1359
+ if hasattr(self, 'hf_api_key') and self.hf_api_key:
1360
+ headers["Authorization"] = f"Bearer {self.hf_api_key}"
1361
+
1362
+ response = requests.post(
1363
+ api_url,
1364
+ headers=headers,
1365
+ json={
1366
+ "inputs": prompt,
1367
+ "parameters": {
1368
+ "temperature": 0.2 if mode == "keywords" else 0.7,
1369
+ "max_new_tokens": 80 if mode == "keywords" else 256,
1370
+ "return_full_text": False
1371
+ }
1372
+ },
1373
+ timeout=60
1374
+ )
1375
+
1376
+ if response.status_code == 200:
1377
+ result = response.json()
1378
+ if isinstance(result, list) and len(result) > 0:
1379
+ return result[0].get("generated_text", "")
1380
+ elif isinstance(result, dict):
1381
+ return result.get("generated_text", "")
1382
+ elif response.status_code == 503:
1383
+ # Model is loading, wait and retry
1384
+ print("⚠️ Model is loading, please wait...")
1385
+ return None
1386
+ else:
1387
+ print(f"Hugging Face API error: {response.status_code} - {response.text}")
1388
+ return None
1389
+ except Exception as e:
1390
+ print(f"Hugging Face API error: {e}")
1391
+ return None
1392
+
1393
+ def _generate_local(self, prompt: str, mode: str = "answer") -> Optional[str]:
1394
+ """Generate answer using local Hugging Face Transformers model."""
1395
+ if self.local_model is None or self.local_tokenizer is None:
1396
+ return None
1397
+
1398
+ try:
1399
+ import torch
1400
+
1401
+ # Format prompt for Qwen models
1402
+ if mode == "keywords":
1403
+ system_content = (
1404
+ "Bạn là trợ lý trích xuất từ khóa. Nhận câu hỏi pháp lý và "
1405
+ "chỉ trả về 5-8 từ khóa tiếng Việt, phân tách bằng dấu phẩy. "
1406
+ "Không viết câu đầy đủ, không thêm lời giải thích."
1407
+ )
1408
+ else:
1409
+ system_content = (
1410
+ "Bạn là chuyên gia tư vấn pháp luật. Trả lời tự nhiên, ngắn gọn, "
1411
+ "dựa trên thông tin đã cho."
1412
+ )
1413
+
1414
+ messages = [
1415
+ {"role": "system", "content": system_content},
1416
+ {"role": "user", "content": prompt},
1417
+ ]
1418
+
1419
+ # Apply chat template if available
1420
+ if hasattr(self.local_tokenizer, "apply_chat_template"):
1421
+ text = self.local_tokenizer.apply_chat_template(
1422
+ messages,
1423
+ tokenize=False,
1424
+ add_generation_prompt=True
1425
+ )
1426
+ else:
1427
+ text = prompt
1428
+
1429
+ # Tokenize
1430
+ inputs = self.local_tokenizer(text, return_tensors="pt")
1431
+
1432
+ # Move to device
1433
+ device = next(self.local_model.parameters()).device
1434
+ inputs = {k: v.to(device) for k, v in inputs.items()}
1435
+
1436
+ # Generate with optimized parameters for faster inference
1437
+ with torch.no_grad():
1438
+ # Use greedy decoding for faster generation (can switch to sampling if needed)
1439
+ outputs = self.local_model.generate(
1440
+ **inputs,
1441
+ max_new_tokens=80 if mode == "keywords" else 256,
1442
+ temperature=0.2 if mode == "keywords" else 0.6,
1443
+ top_p=0.7 if mode == "keywords" else 0.85,
1444
+ do_sample=True,
1445
+ use_cache=True, # Enable KV cache for faster generation
1446
+ pad_token_id=self.local_tokenizer.eos_token_id,
1447
+ repetition_penalty=1.05 if mode == "keywords" else 1.1,
1448
+ )
1449
+
1450
+ # Decode
1451
+ generated_text = self.local_tokenizer.decode(
1452
+ outputs[0][inputs["input_ids"].shape[1]:],
1453
+ skip_special_tokens=True
1454
+ )
1455
+
1456
+ return generated_text.strip()
1457
+
1458
+ except TypeError as e:
1459
+ # Check for Int8Params compatibility error
1460
+ if "_is_hf_initialized" in str(e) or "Int8Params" in str(e):
1461
+ error_msg = (
1462
+ f"[LLM] ❌ Int8Params compatibility error: {e}\n"
1463
+ f"[LLM] 💡 This error occurs when using 8-bit quantization with incompatible library versions.\n"
1464
+ f"[LLM] 💡 Solutions:\n"
1465
+ f"[LLM] 1. Set LOCAL_MODEL_QUANTIZATION=4bit to use 4-bit quantization instead\n"
1466
+ f"[LLM] 2. Set LOCAL_MODEL_QUANTIZATION=none to disable quantization\n"
1467
+ f"[LLM] 3. Use API mode (LLM_PROVIDER=api) to avoid local model issues\n"
1468
+ f"[LLM] 4. Use a smaller model like Qwen/Qwen2.5-1.5B-Instruct"
1469
+ )
1470
+ print(error_msg, flush=True)
1471
+ logger.error(f"[LLM] ❌ Int8Params compatibility error: {e}")
1472
+ print(f"[LLM] ❌ ERROR: {type(e).__name__}: {str(e)}", file=sys.stderr, flush=True)
1473
+ return None
1474
+ else:
1475
+ # Other TypeError, re-raise to be caught by general handler
1476
+ raise
1477
+ except Exception as e:
1478
+ error_trace = traceback.format_exc()
1479
+ print(f"[LLM] ❌ Local model generation error: {e}", flush=True)
1480
+ print(f"[LLM] ❌ Full trace: {error_trace}", flush=True)
1481
+ logger.error(f"[LLM] ❌ Local model generation error: {e}\n{error_trace}")
1482
+ print(f"[LLM] ❌ ERROR: {type(e).__name__}: {str(e)}", file=sys.stderr, flush=True)
1483
+ traceback.print_exc(file=sys.stderr)
1484
+ return None
1485
+
1486
+ def _generate_llama_cpp(self, prompt: str, mode: str = "answer") -> Optional[str]:
1487
+ """Generate answer using llama.cpp GGUF runtime."""
1488
+ if self.llama_cpp is None:
1489
+ return None
1490
+
1491
+ try:
1492
+ if mode == "keywords":
1493
+ temperature = float(os.environ.get("LLAMA_CPP_TEMPERATURE_KW", "0.2"))
1494
+ top_p = float(os.environ.get("LLAMA_CPP_TOP_P_KW", "0.7"))
1495
+ max_tokens = int(os.environ.get("LLAMA_CPP_MAX_TOKENS_KW", "80"))
1496
+ repeat_penalty = float(os.environ.get("LLAMA_CPP_REPEAT_PENALTY_KW", "1.05"))
1497
+ system_prompt = os.environ.get(
1498
+ "LLAMA_CPP_SYSTEM_PROMPT_KW",
1499
+ (
1500
+ "Bạn là trợ lý trích xuất từ khóa. Nhiệm vụ: nhận câu hỏi pháp lý "
1501
+ "và chỉ trả về 5-8 từ khóa tiếng Việt, phân tách bằng dấu phẩy. "
1502
+ "Không giải thích, không viết câu đầy đủ, không thêm tiền tố/hậu tố."
1503
+ ),
1504
+ )
1505
+ else:
1506
+ temperature = float(os.environ.get("LLAMA_CPP_TEMPERATURE", "0.35"))
1507
+ top_p = float(os.environ.get("LLAMA_CPP_TOP_P", "0.85"))
1508
+ max_tokens = int(os.environ.get("LLAMA_CPP_MAX_TOKENS", "256"))
1509
+ repeat_penalty = float(os.environ.get("LLAMA_CPP_REPEAT_PENALTY", "1.1"))
1510
+ system_prompt = os.environ.get(
1511
+ "LLAMA_CPP_SYSTEM_PROMPT",
1512
+ (
1513
+ "Bạn là chuyên gia tư vấn về xử lí kỷ luật cán bộ đảng viên của "
1514
+ "Phòng Thanh Tra - Công An Thành Phố Huế. Trả lời ngắn gọn, chính "
1515
+ "xác, trích dẫn văn bản và mã điều nếu có."
1516
+ ),
1517
+ )
1518
+
1519
+ response = self.llama_cpp.create_chat_completion(
1520
+ messages=[
1521
+ {"role": "system", "content": system_prompt},
1522
+ {"role": "user", "content": prompt},
1523
+ ],
1524
+ temperature=temperature,
1525
+ top_p=top_p,
1526
+ max_tokens=max_tokens,
1527
+ repeat_penalty=repeat_penalty,
1528
+ stream=False,
1529
+ )
1530
+
1531
+ choices = response.get("choices")
1532
+ if not choices:
1533
+ return None
1534
+ content = choices[0]["message"]["content"]
1535
+ if isinstance(content, list):
1536
+ # llama.cpp may return list of segments
1537
+ content = "".join(segment.get("text", "") for segment in content)
1538
+ if isinstance(content, str):
1539
+ return content.strip()
1540
+ return None
1541
+ except Exception as exc:
1542
+ error_trace = traceback.format_exc()
1543
+ print(f"[LLM] ❌ llama.cpp generation error: {exc}", flush=True)
1544
+ print(f"[LLM] ❌ Trace: {error_trace}", flush=True)
1545
+ logger.error("llama.cpp generation error: %s\n%s", exc, error_trace)
1546
+ return None
1547
+
1548
+ def _generate_api(self, prompt: str, context: Optional[List[Dict[str, Any]]] = None) -> Optional[str]:
1549
+ """Generate answer by calling HF Spaces API.
1550
+
1551
+ Args:
1552
+ prompt: Full prompt including query and documents context.
1553
+ context: Optional conversation context (not used in API mode, handled by HF Spaces).
1554
+ """
1555
+ if not self.api_base_url:
1556
+ return None
1557
+
1558
+ try:
1559
+ import requests
1560
+
1561
+ # Prepare request payload
1562
+ # Send the full prompt (with documents) as the message to HF Spaces
1563
+ # This ensures HF Spaces receives all context from retrieved documents
1564
+ payload = {
1565
+ "message": prompt,
1566
+ "reset_session": False
1567
+ }
1568
+
1569
+ # Only add session_id if we have a valid session context
1570
+ # For now, we'll omit it and let the API generate a new one
1571
+
1572
+ # Add context if available (API may support this in future)
1573
+ # For now, context is handled by the API internally
1574
+
1575
+ # Call API endpoint
1576
+ api_url = f"{self.api_base_url}/chatbot/chat/"
1577
+ print(f"[LLM] 🔗 Calling API: {api_url}", flush=True)
1578
+ print(f"[LLM] 📤 Payload: {payload}", flush=True)
1579
+
1580
+ response = requests.post(
1581
+ api_url,
1582
+ json=payload,
1583
+ headers={"Content-Type": "application/json"},
1584
+ timeout=60
1585
+ )
1586
+
1587
+ print(f"[LLM] 📥 Response status: {response.status_code}", flush=True)
1588
+ print(f"[LLM] 📥 Response headers: {dict(response.headers)}", flush=True)
1589
+
1590
+ if response.status_code == 200:
1591
+ try:
1592
+ result = response.json()
1593
+ print(f"[LLM] 📥 Response JSON: {result}", flush=True)
1594
+ # Extract message from response
1595
+ if isinstance(result, dict):
1596
+ message = result.get("message", None)
1597
+ if message:
1598
+ print(f"[LLM] ✅ Got message from API (length: {len(message)})", flush=True)
1599
+ return message
1600
+ else:
1601
+ print(f"[LLM] ⚠️ Response is not a dict: {type(result)}", flush=True)
1602
+ return None
1603
+ except ValueError as e:
1604
+ print(f"[LLM] ❌ JSON decode error: {e}", flush=True)
1605
+ print(f"[LLM] ❌ Response text: {response.text[:500]}", flush=True)
1606
+ return None
1607
+ elif response.status_code == 503:
1608
+ # Service unavailable - model might be loading
1609
+ print("[LLM] ⚠️ API service is loading, please wait...", flush=True)
1610
+ return None
1611
+ else:
1612
+ print(f"[LLM] ❌ API error: {response.status_code} - {response.text[:500]}", flush=True)
1613
+ return None
1614
+ except requests.exceptions.Timeout:
1615
+ print("[LLM] ❌ API request timeout")
1616
+ return None
1617
+ except requests.exceptions.ConnectionError as e:
1618
+ print(f"[LLM] ❌ API connection error: {e}")
1619
+ return None
1620
+ except Exception as e:
1621
+ error_trace = traceback.format_exc()
1622
+ print(f"[LLM] ❌ API mode error: {e}", flush=True)
1623
+ print(f"[LLM] ❌ Full trace: {error_trace}", flush=True)
1624
+ logger.error(f"[LLM] ❌ API mode error: {e}\n{error_trace}")
1625
+ return None
1626
+
1627
+ def summarize_context(self, messages: List[Dict[str, Any]], max_length: int = 200) -> str:
1628
+ """
1629
+ Summarize conversation context.
1630
+
1631
+ Args:
1632
+ messages: List of conversation messages.
1633
+ max_length: Maximum summary length.
1634
+
1635
+ Returns:
1636
+ Summary string.
1637
+ """
1638
+ if not messages:
1639
+ return ""
1640
+
1641
+ # Simple summarization: extract key entities and intents
1642
+ intents = []
1643
+ entities = set()
1644
+
1645
+ for msg in messages:
1646
+ if msg.get("intent"):
1647
+ intents.append(msg["intent"])
1648
+ if msg.get("entities"):
1649
+ for key, value in msg["entities"].items():
1650
+ if isinstance(value, str):
1651
+ entities.add(value)
1652
+ elif isinstance(value, list):
1653
+ entities.update(value)
1654
+
1655
+ summary_parts = []
1656
+ if intents:
1657
+ unique_intents = list(set(intents))
1658
+ summary_parts.append(f"Chủ đề: {', '.join(unique_intents)}")
1659
+ if entities:
1660
+ summary_parts.append(f"Thông tin: {', '.join(list(entities)[:5])}")
1661
+
1662
+ summary = ". ".join(summary_parts)
1663
+ return summary[:max_length] if len(summary) > max_length else summary
1664
+
1665
+ def extract_entities_llm(self, query: str) -> Dict[str, Any]:
1666
+ """
1667
+ Extract entities using LLM.
1668
+
1669
+ Args:
1670
+ query: User query.
1671
+
1672
+ Returns:
1673
+ Dictionary of extracted entities.
1674
+ """
1675
+ if not self.is_available():
1676
+ return {}
1677
+
1678
+ prompt = f"""
1679
+ Trích xuất các thực thể từ câu hỏi sau:
1680
+ "{query}"
1681
+
1682
+ Các loại thực thể cần tìm:
1683
+ - fine_code: Mã vi phạm (V001, V002, ...)
1684
+ - fine_name: Tên vi phạm
1685
+ - procedure_name: Tên thủ tục
1686
+ - office_name: Tên đơn vị
1687
+
1688
+ Trả lời dưới dạng JSON: {{"fine_code": "...", "fine_name": "...", ...}}
1689
+ Nếu không có, trả về {{}}.
1690
+ """
1691
+
1692
+ try:
1693
+ if self.provider == LLM_PROVIDER_OPENAI:
1694
+ response = self._generate_openai(prompt)
1695
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
1696
+ response = self._generate_anthropic(prompt)
1697
+ elif self.provider == LLM_PROVIDER_OLLAMA:
1698
+ response = self._generate_ollama(prompt)
1699
+ elif self.provider == LLM_PROVIDER_HUGGINGFACE:
1700
+ response = self._generate_huggingface(prompt)
1701
+ elif self.provider == LLM_PROVIDER_LOCAL:
1702
+ response = self._generate_local(prompt)
1703
+ elif self.provider == LLM_PROVIDER_API:
1704
+ # For API mode, we can't extract entities directly
1705
+ # Return empty dict
1706
+ return {}
1707
+ else:
1708
+ return {}
1709
+
1710
+ if response:
1711
+ # Try to extract JSON from response
1712
+ json_match = re.search(r'\{[^}]+\}', response)
1713
+ if json_match:
1714
+ return json.loads(json_match.group())
1715
+ except Exception as e:
1716
+ print(f"Error extracting entities with LLM: {e}")
1717
+
1718
+ return {}
1719
+
1720
+
1721
+ # Global LLM generator instance
1722
+ _llm_generator: Optional[LLMGenerator] = None
1723
+ _last_provider: Optional[str] = None
1724
+
1725
+ def get_llm_generator() -> Optional[LLMGenerator]:
1726
+ """Get or create LLM generator instance.
1727
+
1728
+ Recreates instance only if provider changed (e.g., from local to api).
1729
+ Model is kept alive and reused across requests.
1730
+ """
1731
+ global _llm_generator, _last_provider
1732
+
1733
+ # Get current provider from env
1734
+ current_provider = os.environ.get("LLM_PROVIDER", LLM_PROVIDER).lower()
1735
+
1736
+ # Recreate only if provider changed, instance doesn't exist, or model not available
1737
+ if _llm_generator is None or _last_provider != current_provider or not _llm_generator.is_available():
1738
+ _llm_generator = LLMGenerator()
1739
+ _last_provider = current_provider
1740
+ print(f"[LLM] 🔄 Recreated LLM generator with provider: {current_provider}", flush=True)
1741
+ else:
1742
+ # Model already exists and provider hasn't changed - reuse it
1743
+ print("[LLM] ♻️ Reusing existing LLM generator instance (model kept alive)", flush=True)
1744
+ logger.debug("[LLM] Reusing existing LLM generator instance (model kept alive)")
1745
+
1746
+ return _llm_generator if _llm_generator.is_available() else None
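Note: the clarification and keyword-extraction helpers above all depend on pulling a JSON object out of free-form LLM output via _extract_json_payload / _slice_to_json. A self-contained sketch of that best-effort extraction, mirroring the logic in the module (the function name and sample string below are illustrative only, not part of the commit):

import json
from typing import Any, Dict, Optional

def extract_json_payload(raw: str) -> Optional[Dict[str, Any]]:
    """Best-effort JSON extraction: try the raw text first, then the outermost {...} slice."""
    if not raw:
        return None
    raw = raw.strip()
    start, end = raw.find("{"), raw.rfind("}")
    sliced = raw[start:end + 1] if start != -1 and end > start else None
    for snippet in (raw, sliced):
        if not snippet:
            continue
        try:
            return json.loads(snippet)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
    return None

# Example: tolerates leading prose around the JSON object returned by the LLM.
print(extract_json_payload('Kết quả: {"keywords": ["giấy phép", "mức phạt"]}'))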
backend/hue_portal/chatbot/llm_integration.py.backup ADDED
@@ -0,0 +1,372 @@
1
+ """
2
+ LLM integration for natural answer generation.
3
+ Supports OpenAI GPT, Anthropic Claude, and local LLMs (Ollama).
4
+ """
5
+ import os
6
+ import re
7
+ import json
8
+ from typing import List, Dict, Any, Optional
9
+ try:
10
+ from dotenv import load_dotenv
11
+ load_dotenv()
12
+ except ImportError:
13
+ pass # dotenv is optional
14
+
15
+ # LLM Provider types
16
+ LLM_PROVIDER_OPENAI = "openai"
17
+ LLM_PROVIDER_ANTHROPIC = "anthropic"
18
+ LLM_PROVIDER_OLLAMA = "ollama"
19
+ LLM_PROVIDER_NONE = "none"
20
+
21
+ # Get provider from environment
22
+ LLM_PROVIDER = os.environ.get("LLM_PROVIDER", LLM_PROVIDER_NONE).lower()
23
+
24
+
25
+ class LLMGenerator:
26
+ """Generate natural language answers using LLMs."""
27
+
28
+ def __init__(self, provider: Optional[str] = None):
29
+ """
30
+ Initialize LLM generator.
31
+
32
+ Args:
33
+ provider: LLM provider ('openai', 'anthropic', 'ollama', or None for auto-detect).
34
+ """
35
+ self.provider = provider or LLM_PROVIDER
36
+ self.client = None
37
+ self._initialize_client()
38
+
39
+ def _initialize_client(self):
40
+ """Initialize LLM client based on provider."""
41
+ if self.provider == LLM_PROVIDER_OPENAI:
42
+ try:
43
+ import openai
44
+ api_key = os.environ.get("OPENAI_API_KEY")
45
+ if api_key:
46
+ self.client = openai.OpenAI(api_key=api_key)
47
+ print("✅ OpenAI client initialized")
48
+ else:
49
+ print("⚠️ OPENAI_API_KEY not found, OpenAI disabled")
50
+ except ImportError:
51
+ print("⚠️ openai package not installed, install with: pip install openai")
52
+
53
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
54
+ try:
55
+ import anthropic
56
+ api_key = os.environ.get("ANTHROPIC_API_KEY")
57
+ if api_key:
58
+ self.client = anthropic.Anthropic(api_key=api_key)
59
+ print("✅ Anthropic client initialized")
60
+ else:
61
+ print("⚠️ ANTHROPIC_API_KEY not found, Anthropic disabled")
62
+ except ImportError:
63
+ print("⚠️ anthropic package not installed, install with: pip install anthropic")
64
+
65
+ elif self.provider == LLM_PROVIDER_OLLAMA:
66
+ self.ollama_base_url = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
67
+ print(f"✅ Ollama configured (base_url: {self.ollama_base_url})")
68
+
69
+ else:
70
+ print("ℹ️ No LLM provider configured, using template-based generation")
71
+
72
+ def is_available(self) -> bool:
73
+ """Check if LLM is available."""
74
+ return self.client is not None or self.provider == LLM_PROVIDER_OLLAMA
75
+
76
+ def generate_answer(
77
+ self,
78
+ query: str,
79
+ context: Optional[List[Dict[str, Any]]] = None,
80
+ documents: Optional[List[Any]] = None
81
+ ) -> Optional[str]:
82
+ """
83
+ Generate natural language answer from documents.
84
+
85
+ Args:
86
+ query: User query.
87
+ context: Optional conversation context.
88
+ documents: Retrieved documents.
89
+
90
+ Returns:
91
+ Generated answer or None if LLM not available.
92
+ """
93
+ if not self.is_available():
94
+ return None
95
+
96
+ # Build prompt
97
+ prompt = self._build_prompt(query, context, documents)
98
+
99
+ try:
100
+ if self.provider == LLM_PROVIDER_OPENAI:
101
+ return self._generate_openai(prompt)
102
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
103
+ return self._generate_anthropic(prompt)
104
+ elif self.provider == LLM_PROVIDER_OLLAMA:
105
+ return self._generate_ollama(prompt)
106
+ except Exception as e:
107
+ print(f"Error generating answer with LLM: {e}")
108
+ return None
109
+
110
+ def _build_prompt(
111
+ self,
112
+ query: str,
113
+ context: Optional[List[Dict[str, Any]]],
114
+ documents: Optional[List[Any]]
115
+ ) -> str:
116
+ """Build prompt for LLM."""
117
+ prompt_parts = [
118
+ "Bạn là chatbot tư vấn pháp lý của Công an Thừa Thiên Huế.",
119
+ "Nhiệm vụ: Trả lời câu hỏi của người dùng dựa trên các văn bản pháp luật và quy định được cung cấp.",
120
+ "",
121
+ f"Câu hỏi của người dùng: {query}",
122
+ ""
123
+ ]
124
+
125
+ if context:
126
+ prompt_parts.append("Ngữ cảnh cuộc hội thoại trước đó:")
127
+ for msg in context[-3:]: # Last 3 messages
128
+ role = "Người dùng" if msg.get("role") == "user" else "Bot"
129
+ content = msg.get("content", "")
130
+ prompt_parts.append(f"{role}: {content}")
131
+ prompt_parts.append("")
132
+
133
+ if documents:
134
+ prompt_parts.append("Các văn bản/quy định liên quan:")
135
+ for i, doc in enumerate(documents[:5], 1):
136
+ # Extract relevant fields based on document type
137
+ doc_text = self._format_document(doc)
138
+ prompt_parts.append(f"{i}. {doc_text}")
139
+ prompt_parts.append("")
140
+
141
+ prompt_parts.extend([
142
+ "Yêu cầu QUAN TRỌNG:",
143
+ "- CHỈ trả lời dựa trên thông tin trong 'Các văn bản/quy định liên quan' ở trên",
144
+ "- KHÔNG được tự tạo hoặc suy đoán thông tin không có trong tài liệu",
145
+ "- Nếu thông tin không đủ để trả lời, hãy nói rõ: 'Thông tin trong cơ sở dữ liệu chưa đủ để trả lời câu hỏi này'",
146
+ "- Nếu có mức phạt, phải ghi rõ số tiền (ví dụ: 200.000 - 400.000 VNĐ)",
147
+ "- Nếu có điều khoản, ghi rõ mã điều (ví dụ: Điều 5, Điều 10)",
148
+ "- Nếu có thủ tục, ghi rõ hồ sơ, lệ phí, thời hạn",
149
+ "- Trả lời bằng tiếng Việt, ngắn gọn, dễ hiểu",
150
+ "",
151
+ "Trả lời:"
152
+ ])
153
+
154
+ return "\n".join(prompt_parts)
155
+
156
+ def _format_document(self, doc: Any) -> str:
157
+ """Format document for prompt."""
158
+ doc_type = type(doc).__name__.lower()
159
+
160
+ if "fine" in doc_type:
161
+ parts = [f"Mức phạt: {getattr(doc, 'name', '')}"]
162
+ if hasattr(doc, 'code') and doc.code:
163
+ parts.append(f"Mã: {doc.code}")
164
+ if hasattr(doc, 'min_fine') and hasattr(doc, 'max_fine'):
165
+ if doc.min_fine and doc.max_fine:
166
+ parts.append(f"Số tiền: {doc.min_fine:,.0f} - {doc.max_fine:,.0f} VNĐ")
167
+ return " | ".join(parts)
168
+
169
+ elif "procedure" in doc_type:
170
+ parts = [f"Thủ tục: {getattr(doc, 'title', '')}"]
171
+ if hasattr(doc, 'dossier') and doc.dossier:
172
+ parts.append(f"Hồ sơ: {doc.dossier}")
173
+ if hasattr(doc, 'fee') and doc.fee:
174
+ parts.append(f"Lệ phí: {doc.fee}")
175
+ return " | ".join(parts)
176
+
177
+ elif "office" in doc_type:
178
+ parts = [f"Đơn vị: {getattr(doc, 'unit_name', '')}"]
179
+ if hasattr(doc, 'address') and doc.address:
180
+ parts.append(f"Địa chỉ: {doc.address}")
181
+ if hasattr(doc, 'phone') and doc.phone:
182
+ parts.append(f"Điện thoại: {doc.phone}")
183
+ return " | ".join(parts)
184
+
185
+ elif "advisory" in doc_type:
186
+ parts = [f"Cảnh báo: {getattr(doc, 'title', '')}"]
187
+ if hasattr(doc, 'summary') and doc.summary:
188
+ parts.append(f"Nội dung: {doc.summary[:200]}")
189
+ return " | ".join(parts)
190
+
191
+ elif "legalsection" in doc_type or "legal" in doc_type:
192
+ parts = []
193
+ if hasattr(doc, 'section_code') and doc.section_code:
194
+ parts.append(f"Điều khoản: {doc.section_code}")
195
+ if hasattr(doc, 'section_title') and doc.section_title:
196
+ parts.append(f"Tiêu đề: {doc.section_title}")
197
+ if hasattr(doc, 'document') and doc.document:
198
+ doc_obj = doc.document
199
+ if hasattr(doc_obj, 'title'):
200
+ parts.append(f"Văn bản: {doc_obj.title}")
201
+ if hasattr(doc_obj, 'code'):
202
+ parts.append(f"Mã văn bản: {doc_obj.code}")
203
+ if hasattr(doc, 'content') and doc.content:
204
+ # Truncate content to 300 chars for prompt
205
+ content_short = doc.content[:300] + "..." if len(doc.content) > 300 else doc.content
206
+ parts.append(f"Nội dung: {content_short}")
207
+ return " | ".join(parts) if parts else str(doc)
208
+
209
+ return str(doc)
210
+
211
+ def _generate_openai(self, prompt: str) -> Optional[str]:
212
+ """Generate answer using OpenAI."""
213
+ if not self.client:
214
+ return None
215
+
216
+ try:
217
+ response = self.client.chat.completions.create(
218
+ model=os.environ.get("OPENAI_MODEL", "gpt-3.5-turbo"),
219
+ messages=[
220
+ {"role": "system", "content": "Bạn là chatbot tư vấn chuyên nghiệp."},
221
+ {"role": "user", "content": prompt}
222
+ ],
223
+ temperature=0.7,
224
+ max_tokens=500
225
+ )
226
+ return response.choices[0].message.content
227
+ except Exception as e:
228
+ print(f"OpenAI API error: {e}")
229
+ return None
230
+
231
+ def _generate_anthropic(self, prompt: str) -> Optional[str]:
232
+ """Generate answer using Anthropic Claude."""
233
+ if not self.client:
234
+ return None
235
+
236
+ try:
237
+ message = self.client.messages.create(
238
+ model=os.environ.get("ANTHROPIC_MODEL", "claude-3-haiku-20240307"),
239
+ max_tokens=500,
240
+ messages=[
241
+ {"role": "user", "content": prompt}
242
+ ]
243
+ )
244
+ return message.content[0].text
245
+ except Exception as e:
246
+ print(f"Anthropic API error: {e}")
247
+ return None
248
+
249
+ def _generate_ollama(self, prompt: str) -> Optional[str]:
250
+ """Generate answer using Ollama (local LLM)."""
251
+ try:
252
+ import requests
253
+ model = os.environ.get("OLLAMA_MODEL", "gemma3:1b")
254
+
255
+ response = requests.post(
256
+ f"{self.ollama_base_url}/api/generate",
257
+ json={
258
+ "model": model,
259
+ "prompt": prompt,
260
+ "stream": False,
261
+ "options": {
262
+ "temperature": 0.7,
263
+ "top_p": 0.9,
264
+ "num_predict": 500
265
+ }
266
+ },
267
+ timeout=60
268
+ )
269
+
270
+ if response.status_code == 200:
271
+ return response.json().get("response")
272
+ return None
273
+ except Exception as e:
274
+ print(f"Ollama API error: {e}")
275
+ return None
276
+
277
+ def summarize_context(self, messages: List[Dict[str, Any]], max_length: int = 200) -> str:
278
+ """
279
+ Summarize conversation context.
280
+
281
+ Args:
282
+ messages: List of conversation messages.
283
+ max_length: Maximum summary length.
284
+
285
+ Returns:
286
+ Summary string.
287
+ """
288
+ if not messages:
289
+ return ""
290
+
291
+ # Simple summarization: extract key entities and intents
292
+ intents = []
293
+ entities = set()
294
+
295
+ for msg in messages:
296
+ if msg.get("intent"):
297
+ intents.append(msg["intent"])
298
+ if msg.get("entities"):
299
+ for key, value in msg["entities"].items():
300
+ if isinstance(value, str):
301
+ entities.add(value)
302
+ elif isinstance(value, list):
303
+ entities.update(value)
304
+
305
+ summary_parts = []
306
+ if intents:
307
+ unique_intents = list(set(intents))
308
+ summary_parts.append(f"Chủ đề: {', '.join(unique_intents)}")
309
+ if entities:
310
+ summary_parts.append(f"Thông tin: {', '.join(list(entities)[:5])}")
311
+
312
+ summary = ". ".join(summary_parts)
313
+ return summary[:max_length] if len(summary) > max_length else summary
314
+
315
+ def extract_entities_llm(self, query: str) -> Dict[str, Any]:
316
+ """
317
+ Extract entities using LLM.
318
+
319
+ Args:
320
+ query: User query.
321
+
322
+ Returns:
323
+ Dictionary of extracted entities.
324
+ """
325
+ if not self.is_available():
326
+ return {}
327
+
328
+ prompt = f"""
329
+ Trích xuất các thực thể từ câu hỏi sau:
330
+ "{query}"
331
+
332
+ Các loại thực thể cần tìm:
333
+ - fine_code: Mã vi phạm (V001, V002, ...)
334
+ - fine_name: Tên vi phạm
335
+ - procedure_name: Tên thủ tục
336
+ - office_name: Tên đơn vị
337
+
338
+ Trả lời dưới dạng JSON: {{"fine_code": "...", "fine_name": "...", ...}}
339
+ Nếu không có, trả về {{}}.
340
+ """
341
+
342
+ try:
343
+ if self.provider == LLM_PROVIDER_OPENAI:
344
+ response = self._generate_openai(prompt)
345
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
346
+ response = self._generate_anthropic(prompt)
347
+ elif self.provider == LLM_PROVIDER_OLLAMA:
348
+ response = self._generate_ollama(prompt)
349
+ else:
350
+ return {}
351
+
352
+ if response:
353
+ # Try to extract JSON from response
354
+ json_match = re.search(r'\{[^}]+\}', response)
355
+ if json_match:
356
+ return json.loads(json_match.group())
357
+ except Exception as e:
358
+ print(f"Error extracting entities with LLM: {e}")
359
+
360
+ return {}
361
+
362
+
363
+ # Global LLM generator instance
364
+ _llm_generator: Optional[LLMGenerator] = None
365
+
366
+ def get_llm_generator() -> Optional[LLMGenerator]:
367
+ """Get or create LLM generator instance."""
368
+ global _llm_generator
369
+ if _llm_generator is None:
370
+ _llm_generator = LLMGenerator()
371
+ return _llm_generator if _llm_generator.is_available() else None
372
+
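Note: this backup keeps the same Ollama call shape as the current module: a non-streaming POST to /api/generate that reads the "response" field. A minimal standalone sketch of that request, assuming a local Ollama server on its default port (the model name is only an example):

from typing import Optional
import requests

def ollama_generate(prompt: str,
                    model: str = "qwen2.5:7b",
                    base_url: str = "http://localhost:11434") -> Optional[str]:
    """Call Ollama's non-streaming generate endpoint and return its 'response' text."""
    resp = requests.post(
        f"{base_url}/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {"temperature": 0.7, "top_p": 0.9, "num_predict": 500},
        },
        timeout=60,
    )
    return resp.json().get("response") if resp.status_code == 200 else None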
backend/hue_portal/chatbot/llm_integration.py.bak ADDED
@@ -0,0 +1,877 @@
1
+ """
2
+ LLM integration for natural answer generation.
3
+ Supports OpenAI GPT, Anthropic Claude, Ollama, Hugging Face Inference API, Local Hugging Face models, and API mode.
4
+ """
5
+ import os
6
+ import re
7
+ import json
8
+ import sys
9
+ import traceback
10
+ import logging
11
+ import time
12
+ from typing import List, Dict, Any, Optional
13
+ try:
14
+ from dotenv import load_dotenv
15
+ load_dotenv()
16
+ except ImportError:
17
+ pass # dotenv is optional
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Import download progress tracker (optional)
22
+ try:
23
+ from .download_progress import get_progress_tracker, DownloadProgress
24
+ PROGRESS_TRACKER_AVAILABLE = True
25
+ except ImportError:
26
+ PROGRESS_TRACKER_AVAILABLE = False
27
+ logger.warning("Download progress tracker not available")
28
+
29
+ # LLM Provider types
30
+ LLM_PROVIDER_OPENAI = "openai"
31
+ LLM_PROVIDER_ANTHROPIC = "anthropic"
32
+ LLM_PROVIDER_OLLAMA = "ollama"
33
+ LLM_PROVIDER_HUGGINGFACE = "huggingface" # Hugging Face Inference API
34
+ LLM_PROVIDER_LOCAL = "local" # Local Hugging Face Transformers model
35
+ LLM_PROVIDER_API = "api" # API mode - call HF Spaces API
36
+ LLM_PROVIDER_NONE = "none"
37
+
38
+ # Get provider from environment (default to local Qwen if none provided)
39
+ DEFAULT_LLM_PROVIDER = os.environ.get("DEFAULT_LLM_PROVIDER", LLM_PROVIDER_LOCAL).lower()
40
+ env_provider = os.environ.get("LLM_PROVIDER", "").strip().lower()
41
+ LLM_PROVIDER = env_provider or DEFAULT_LLM_PROVIDER
42
+
43
+
44
+ class LLMGenerator:
45
+ """Generate natural language answers using LLMs."""
46
+
47
+ def __init__(self, provider: Optional[str] = None):
48
+ """
49
+ Initialize LLM generator.
50
+
51
+ Args:
52
+ provider: LLM provider ('openai', 'anthropic', 'ollama', 'local', 'huggingface', 'api', or None for auto-detect).
53
+ """
54
+ self.provider = provider or LLM_PROVIDER
55
+ self.client = None
56
+ self.local_model = None
57
+ self.local_tokenizer = None
58
+ self.api_base_url = None
59
+ self._initialize_client()
60
+
61
+ def _initialize_client(self):
62
+ """Initialize LLM client based on provider."""
63
+ if self.provider == LLM_PROVIDER_OPENAI:
64
+ try:
65
+ import openai
66
+ api_key = os.environ.get("OPENAI_API_KEY")
67
+ if api_key:
68
+ self.client = openai.OpenAI(api_key=api_key)
69
+ print("✅ OpenAI client initialized")
70
+ else:
71
+ print("⚠️ OPENAI_API_KEY not found, OpenAI disabled")
72
+ except ImportError:
73
+ print("⚠️ openai package not installed, install with: pip install openai")
74
+
75
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
76
+ try:
77
+ import anthropic
78
+ api_key = os.environ.get("ANTHROPIC_API_KEY")
79
+ if api_key:
80
+ self.client = anthropic.Anthropic(api_key=api_key)
81
+ print("✅ Anthropic client initialized")
82
+ else:
83
+ print("⚠️ ANTHROPIC_API_KEY not found, Anthropic disabled")
84
+ except ImportError:
85
+ print("⚠️ anthropic package not installed, install with: pip install anthropic")
86
+
87
+ elif self.provider == LLM_PROVIDER_OLLAMA:
88
+ self.ollama_base_url = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
89
+ self.ollama_model = os.environ.get("OLLAMA_MODEL", "qwen2.5:7b")
90
+ print(f"✅ Ollama configured (base_url: {self.ollama_base_url}, model: {self.ollama_model})")
91
+
92
+ elif self.provider == LLM_PROVIDER_HUGGINGFACE:
93
+ self.hf_api_key = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
94
+ self.hf_model = os.environ.get("HF_MODEL", "Qwen/Qwen2.5-7B-Instruct")
95
+ if self.hf_api_key:
96
+ print(f"✅ Hugging Face API configured (model: {self.hf_model})")
97
+ else:
98
+ print("⚠️ HF_TOKEN not found, Hugging Face may have rate limits")
99
+
100
+ elif self.provider == LLM_PROVIDER_API:
101
+ # API mode - call HF Spaces API
102
+ self.api_base_url = os.environ.get(
103
+ "HF_API_BASE_URL",
104
+ "https://davidtran999-hue-portal-backend.hf.space/api"
105
+ )
106
+ print(f"✅ API mode configured (base_url: {self.api_base_url})")
107
+
108
+ elif self.provider == LLM_PROVIDER_LOCAL:
109
+ self._initialize_local_model()
110
+
111
+ else:
112
+ print("ℹ️ No LLM provider configured, using template-based generation")
113
+
114
+ def _initialize_local_model(self):
115
+ """Initialize local Hugging Face Transformers model."""
116
+ try:
117
+ from transformers import AutoModelForCausalLM, AutoTokenizer
118
+ import torch
119
+
120
+ # Default to Qwen 2.5 7B with 8-bit quantization (fits in GPU RAM)
121
+ model_path = os.environ.get("LOCAL_MODEL_PATH", "Qwen/Qwen2.5-7B-Instruct")
122
+ device = os.environ.get("LOCAL_MODEL_DEVICE", "auto") # auto, cpu, cuda
123
+
124
+ print(f"[LLM] Loading local model: {model_path}", flush=True)
125
+ logger.info(f"[LLM] Loading local model: {model_path}")
126
+
127
+ # Determine device
128
+ if device == "auto":
129
+ device = "cuda" if torch.cuda.is_available() else "cpu"
130
+
131
+ # Start cache monitoring for download progress (optional)
132
+ try:
133
+ from .cache_monitor import get_cache_monitor
134
+ monitor = get_cache_monitor()
135
+ monitor.start_monitoring(model_path, interval=2.0)
136
+ print(f"[LLM] 📊 Started cache monitoring for {model_path}", flush=True)
137
+ logger.info(f"[LLM] 📊 Started cache monitoring for {model_path}")
138
+ except Exception as e:
139
+ logger.warning(f"Could not start cache monitoring: {e}")
140
+
141
+ # Load tokenizer
142
+ print("[LLM] Loading tokenizer...", flush=True)
143
+ logger.info("[LLM] Loading tokenizer...")
144
+ try:
145
+ self.local_tokenizer = AutoTokenizer.from_pretrained(
146
+ model_path,
147
+ trust_remote_code=True
148
+ )
149
+ print("[LLM] ✅ Tokenizer loaded successfully", flush=True)
150
+ logger.info("[LLM] ✅ Tokenizer loaded successfully")
151
+ except Exception as tokenizer_err:
152
+ error_trace = traceback.format_exc()
153
+ print(f"[LLM] ❌ Tokenizer load error: {tokenizer_err}", flush=True)
154
+ print(f"[LLM] ❌ Tokenizer trace: {error_trace}", flush=True)
155
+ logger.error(f"[LLM] ❌ Tokenizer load error: {tokenizer_err}\n{error_trace}")
156
+ print(f"[LLM] ❌ ERROR: {type(tokenizer_err).__name__}: {str(tokenizer_err)}", file=sys.stderr, flush=True)
157
+ traceback.print_exc(file=sys.stderr)
158
+ raise
159
+
160
+ # Load model with optional quantization and fallback mechanism
161
+ print(f"[LLM] Loading model to {device}...", flush=True)
162
+ logger.info(f"[LLM] Loading model to {device}...")
163
+
164
+ # Check for quantization config
165
+ # Default to 8-bit for 7B (better thinking), 4-bit for larger models
166
+ default_8bit = "7b" in model_path.lower() or "7B" in model_path
167
+ default_4bit = ("32b" in model_path.lower() or "32B" in model_path or "14b" in model_path.lower() or "14B" in model_path) and not default_8bit
168
+
169
+ # Check environment variable for explicit quantization preference
170
+ quantization_pref = os.environ.get("LOCAL_MODEL_QUANTIZATION", "").lower()
171
+ if quantization_pref == "4bit":
172
+ use_8bit = False
173
+ use_4bit = True
174
+ elif quantization_pref == "8bit":
175
+ use_8bit = True
176
+ use_4bit = False
177
+ elif quantization_pref == "none":
178
+ use_8bit = False
179
+ use_4bit = False
180
+ else:
181
+ # Use defaults based on model size
182
+ use_8bit = os.environ.get("LOCAL_MODEL_8BIT", "true" if default_8bit else "false").lower() == "true"
183
+ use_4bit = os.environ.get("LOCAL_MODEL_4BIT", "true" if default_4bit else "false").lower() == "true"
184
+
185
+ # Try loading with fallback: 8-bit → 4-bit → float16
186
+ model_loaded = False
187
+ quantization_attempts = []
188
+
189
+ if device == "cuda":
190
+ # Attempt 1: Try 8-bit quantization (if requested)
191
+ if use_8bit:
192
+ quantization_attempts.append(("8-bit", True, False))
193
+
194
+ # Attempt 2: Try 4-bit quantization (if 8-bit fails or not requested)
195
+ if use_4bit or (use_8bit and not model_loaded):
196
+ quantization_attempts.append(("4-bit", False, True))
197
+
198
+ # Attempt 3: Fallback to float16 (no quantization)
199
+ quantization_attempts.append(("float16", False, False))
200
+ else:
201
+ # CPU: only float32
202
+ quantization_attempts.append(("float32", False, False))
203
+
204
+ last_error = None
205
+ for attempt_name, try_8bit, try_4bit in quantization_attempts:
206
+ if model_loaded:
207
+ break
208
+
209
+ try:
210
+ load_kwargs = {
211
+ "trust_remote_code": True,
212
+ "low_cpu_mem_usage": True,
213
+ }
214
+
215
+ if device == "cuda":
216
+ load_kwargs["device_map"] = "auto"
217
+
218
+ if try_4bit:
219
+ from transformers import BitsAndBytesConfig
220
+ load_kwargs["quantization_config"] = BitsAndBytesConfig(
221
+ load_in_4bit=True,
222
+ bnb_4bit_compute_dtype=torch.float16
223
+ )
224
+ print(f"[LLM] Attempting to load with 4-bit quantization (~4-5GB VRAM for 7B)", flush=True)
225
+ elif try_8bit:
226
+ from transformers import BitsAndBytesConfig
227
+ # Fixed: Remove CPU offload to avoid Int8Params compatibility issue
228
+ load_kwargs["quantization_config"] = BitsAndBytesConfig(
229
+ load_in_8bit=True,
230
+ llm_int8_threshold=6.0
231
+ # Removed: llm_int8_enable_fp32_cpu_offload=True (causes compatibility issues)
232
+ )
233
+ # Removed: max_memory override - let accelerate handle it automatically
234
+ print(f"[LLM] Attempting to load with 8-bit quantization (~7GB VRAM for 7B)", flush=True)
235
+ else:
236
+ load_kwargs["torch_dtype"] = torch.float16
237
+ print(f"[LLM] Attempting to load with float16 (no quantization)", flush=True)
238
+ else:
239
+ load_kwargs["torch_dtype"] = torch.float32
240
+ print(f"[LLM] Attempting to load with float32 (CPU)", flush=True)
241
+
242
+ # Load model
243
+ self.local_model = AutoModelForCausalLM.from_pretrained(
244
+ model_path,
245
+ **load_kwargs
246
+ )
247
+
248
+ # Stop cache monitoring (download complete)
249
+ try:
250
+ from .cache_monitor import get_cache_monitor
251
+ monitor = get_cache_monitor()
252
+ monitor.stop_monitoring(model_path)
253
+ print(f"[LLM] ✅ Model download complete, stopped monitoring", flush=True)
254
+ except:
255
+ pass
256
+
257
+ print(f"[LLM] ✅ Model loaded successfully with {attempt_name} quantization", flush=True)
258
+ logger.info(f"[LLM] ✅ Model loaded successfully with {attempt_name} quantization")
259
+ model_loaded = True
260
+
261
+ except Exception as model_load_err:
262
+ last_error = model_load_err
263
+ error_trace = traceback.format_exc()
264
+ print(f"[LLM] ⚠️ Failed to load with {attempt_name}: {model_load_err}", flush=True)
265
+ logger.warning(f"[LLM] ⚠️ Failed to load with {attempt_name}: {model_load_err}")
266
+
267
+ # If this was the last attempt, raise the error
268
+ if attempt_name == quantization_attempts[-1][0]:
269
+ print(f"[LLM] ❌ All quantization attempts failed. Last error: {model_load_err}", flush=True)
270
+ print(f"[LLM] ❌ Model load trace: {error_trace}", flush=True)
271
+ logger.error(f"[LLM] ❌ Model load error: {model_load_err}\n{error_trace}")
272
+ print(f"[LLM] ❌ ERROR: {type(model_load_err).__name__}: {str(model_load_err)}", file=sys.stderr, flush=True)
273
+ traceback.print_exc(file=sys.stderr)
274
+ raise
275
+ else:
276
+ # Try next quantization method
277
+ print(f"[LLM] 🔄 Falling back to next quantization method...", flush=True)
278
+ continue
279
+
280
+ if not model_loaded:
281
+ raise RuntimeError("Failed to load model with any quantization method")
282
+
283
+ if device == "cpu":
284
+ try:
285
+ self.local_model = self.local_model.to(device)
286
+ print(f"[LLM] ✅ Model moved to {device}", flush=True)
287
+ logger.info(f"[LLM] ✅ Model moved to {device}")
288
+ except Exception as move_err:
289
+ error_trace = traceback.format_exc()
290
+ print(f"[LLM] ❌ Model move error: {move_err}", flush=True)
291
+ logger.error(f"[LLM] ❌ Model move error: {move_err}\n{error_trace}")
292
+ print(f"[LLM] ❌ ERROR: {type(move_err).__name__}: {str(move_err)}", file=sys.stderr, flush=True)
293
+ traceback.print_exc(file=sys.stderr)
294
+
295
+ self.local_model.eval() # Set to evaluation mode
296
+ print(f"[LLM] ✅ Local model loaded successfully on {device}", flush=True)
297
+ logger.info(f"[LLM] ✅ Local model loaded successfully on {device}")
298
+
299
+ except ImportError as import_err:
300
+ error_msg = "transformers package not installed, install with: pip install transformers torch"
301
+ print(f"[LLM] ⚠️ {error_msg}", flush=True)
302
+ logger.warning(f"[LLM] ⚠️ {error_msg}")
303
+ print(f"[LLM] ❌ ImportError: {import_err}", file=sys.stderr, flush=True)
304
+ self.local_model = None
305
+ self.local_tokenizer = None
306
+ except Exception as e:
307
+ error_trace = traceback.format_exc()
308
+ print(f"[LLM] ❌ Error loading local model: {e}", flush=True)
309
+ print(f"[LLM] ❌ Full trace: {error_trace}", flush=True)
310
+ logger.error(f"[LLM] ❌ Error loading local model: {e}\n{error_trace}")
311
+ print(f"[LLM] ❌ ERROR: {type(e).__name__}: {str(e)}", file=sys.stderr, flush=True)
312
+ traceback.print_exc(file=sys.stderr)
313
+ print("[LLM] 💡 Tip: Use smaller models like Qwen/Qwen2.5-1.5B-Instruct or Qwen/Qwen2.5-0.5B-Instruct", flush=True)
314
+ self.local_model = None
315
+ self.local_tokenizer = None
316
+
317
+ def is_available(self) -> bool:
318
+ """Check if LLM is available."""
319
+ return (
320
+ self.client is not None or
321
+ self.provider == LLM_PROVIDER_OLLAMA or
322
+ self.provider == LLM_PROVIDER_HUGGINGFACE or
323
+ self.provider == LLM_PROVIDER_API or
324
+ (self.provider == LLM_PROVIDER_LOCAL and self.local_model is not None)
325
+ )
326
+
327
+ def generate_answer(
328
+ self,
329
+ query: str,
330
+ context: Optional[List[Dict[str, Any]]] = None,
331
+ documents: Optional[List[Any]] = None
332
+ ) -> Optional[str]:
333
+ """
334
+ Generate natural language answer from documents.
335
+
336
+ Args:
337
+ query: User query.
338
+ context: Optional conversation context.
339
+ documents: Retrieved documents.
340
+
341
+ Returns:
342
+ Generated answer or None if LLM not available.
343
+ """
344
+ if not self.is_available():
345
+ return None
346
+
347
+ # Build prompt
348
+ prompt = self._build_prompt(query, context, documents)
349
+
350
+ try:
351
+ print(f"[LLM] Generating answer with provider: {self.provider}", flush=True)
352
+ logger.info(f"[LLM] Generating answer with provider: {self.provider}")
353
+
354
+ if self.provider == LLM_PROVIDER_OPENAI:
355
+ result = self._generate_openai(prompt)
356
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
357
+ result = self._generate_anthropic(prompt)
358
+ elif self.provider == LLM_PROVIDER_OLLAMA:
359
+ result = self._generate_ollama(prompt)
360
+ elif self.provider == LLM_PROVIDER_HUGGINGFACE:
361
+ result = self._generate_huggingface(prompt)
362
+ elif self.provider == LLM_PROVIDER_LOCAL:
363
+ result = self._generate_local(prompt)
364
+ elif self.provider == LLM_PROVIDER_API:
365
+ # For API mode, send the full prompt (with documents) as the message
366
+ # This ensures HF Spaces receives all context from retrieved documents
367
+ result = self._generate_api(prompt, context)
368
+ else:
369
+ result = None
370
+
371
+ if result:
372
+ print(f"[LLM] ✅ Answer generated successfully (length: {len(result)})", flush=True)
373
+ logger.info(f"[LLM] ✅ Answer generated successfully (length: {len(result)})")
374
+ else:
375
+ print(f"[LLM] ⚠️ No answer generated", flush=True)
376
+ logger.warning("[LLM] ⚠️ No answer generated")
377
+
378
+ return result
379
+ except Exception as e:
380
+ error_trace = traceback.format_exc()
381
+ print(f"[LLM] ❌ Error generating answer: {e}", flush=True)
382
+ print(f"[LLM] ❌ Full trace: {error_trace}", flush=True)
383
+ logger.error(f"[LLM] ❌ Error generating answer: {e}\n{error_trace}")
384
+ print(f"[LLM] ❌ ERROR: {type(e).__name__}: {str(e)}", file=sys.stderr, flush=True)
385
+ traceback.print_exc(file=sys.stderr)
386
+ return None
387
+
388
+ def _build_prompt(
389
+ self,
390
+ query: str,
391
+ context: Optional[List[Dict[str, Any]]],
392
+ documents: Optional[List[Any]]
393
+ ) -> str:
394
+ """Build prompt for LLM."""
395
+ prompt_parts = [
396
+ "Bạn là chatbot tư vấn pháp lý của Công an Thừa Thiên Huế.",
397
+ "Nhiệm vụ: Trả lời câu hỏi của người dùng dựa trên các văn bản pháp luật và quy định được cung cấp.",
398
+ "",
399
+ f"Câu hỏi của người dùng: {query}",
400
+ ""
401
+ ]
402
+
403
+ if context:
404
+ prompt_parts.append("Ngữ cảnh cuộc hội thoại trước đó:")
405
+ for msg in context[-3:]: # Last 3 messages
406
+ role = "Người dùng" if msg.get("role") == "user" else "Bot"
407
+ content = msg.get("content", "")
408
+ prompt_parts.append(f"{role}: {content}")
409
+ prompt_parts.append("")
410
+
411
+ if documents:
412
+ prompt_parts.append("Các văn bản/quy định liên quan:")
413
+ for i, doc in enumerate(documents[:5], 1):
414
+ # Extract relevant fields based on document type
415
+ doc_text = self._format_document(doc)
416
+ prompt_parts.append(f"{i}. {doc_text}")
417
+ prompt_parts.append("")
418
+ # If documents exist, require strict adherence
419
+ prompt_parts.extend([
420
+ "Yêu cầu QUAN TRỌNG:",
421
+ "- CHỈ trả lời dựa trên thông tin trong 'Các văn bản/quy định liên quan' ở trên",
422
+ "- KHÔNG được tự tạo hoặc suy đoán thông tin không có trong tài liệu",
423
+ "- Nếu thông tin không đủ để trả lời, hãy nói rõ: 'Thông tin trong cơ sở dữ liệu chưa đủ để trả lời câu hỏi này'",
424
+ "- Nếu có mức phạt, phải ghi rõ số tiền (ví dụ: 200.000 - 400.000 VNĐ)",
425
+ "- Nếu có điều khoản, ghi rõ mã điều (ví dụ: Điều 5, Điều 10)",
426
+ "- Nếu có thủ tục, ghi rõ hồ sơ, lệ phí, thời hạn",
427
+ "- Trả lời bằng tiếng Việt, ngắn gọn, dễ hiểu",
428
+ "",
429
+ "Trả lời:"
430
+ ])
431
+ else:
432
+ # No documents - allow general conversation
433
+ prompt_parts.extend([
434
+ "Yêu cầu:",
435
+ "- Trả lời câu hỏi một cách tự nhiên và hữu ích như một chatbot AI thông thường",
436
+ "- Nếu câu hỏi liên quan đến pháp luật, thủ tục, mức phạt nhưng không có thông tin trong cơ sở dữ liệu, hãy nói: 'Tôi không tìm thấy thông tin này trong cơ sở dữ liệu. Bạn có thể liên hệ trực tiếp với Công an Thừa Thiên Huế để được tư vấn chi tiết hơn.'",
437
+ "- Trả lời bằng tiếng Việt, thân thiện, ngắn gọn, dễ hiểu",
438
+ "",
439
+ "Trả lời:"
440
+ ])
441
+
442
+ return "\n".join(prompt_parts)
443
+
444
+ def _format_document(self, doc: Any) -> str:
445
+ """Format document for prompt."""
446
+ doc_type = type(doc).__name__.lower()
447
+
448
+ if "fine" in doc_type:
449
+ parts = [f"Mức phạt: {getattr(doc, 'name', '')}"]
450
+ if hasattr(doc, 'code') and doc.code:
451
+ parts.append(f"Mã: {doc.code}")
452
+ if hasattr(doc, 'min_fine') and hasattr(doc, 'max_fine'):
453
+ if doc.min_fine and doc.max_fine:
454
+ parts.append(f"Số tiền: {doc.min_fine:,.0f} - {doc.max_fine:,.0f} VNĐ")
455
+ return " | ".join(parts)
456
+
457
+ elif "procedure" in doc_type:
458
+ parts = [f"Thủ tục: {getattr(doc, 'title', '')}"]
459
+ if hasattr(doc, 'dossier') and doc.dossier:
460
+ parts.append(f"Hồ sơ: {doc.dossier}")
461
+ if hasattr(doc, 'fee') and doc.fee:
462
+ parts.append(f"Lệ phí: {doc.fee}")
463
+ return " | ".join(parts)
464
+
465
+ elif "office" in doc_type:
466
+ parts = [f"Đơn vị: {getattr(doc, 'unit_name', '')}"]
467
+ if hasattr(doc, 'address') and doc.address:
468
+ parts.append(f"Địa chỉ: {doc.address}")
469
+ if hasattr(doc, 'phone') and doc.phone:
470
+ parts.append(f"Điện thoại: {doc.phone}")
471
+ return " | ".join(parts)
472
+
473
+ elif "advisory" in doc_type:
474
+ parts = [f"Cảnh báo: {getattr(doc, 'title', '')}"]
475
+ if hasattr(doc, 'summary') and doc.summary:
476
+ parts.append(f"Nội dung: {doc.summary[:200]}")
477
+ return " | ".join(parts)
478
+
479
+ elif "legalsection" in doc_type or "legal" in doc_type:
480
+ parts = []
481
+ if hasattr(doc, 'section_code') and doc.section_code:
482
+ parts.append(f"Điều khoản: {doc.section_code}")
483
+ if hasattr(doc, 'section_title') and doc.section_title:
484
+ parts.append(f"Tiêu đề: {doc.section_title}")
485
+ if hasattr(doc, 'document') and doc.document:
486
+ doc_obj = doc.document
487
+ if hasattr(doc_obj, 'title'):
488
+ parts.append(f"Văn bản: {doc_obj.title}")
489
+ if hasattr(doc_obj, 'code'):
490
+ parts.append(f"Mã văn bản: {doc_obj.code}")
491
+ if hasattr(doc, 'content') and doc.content:
492
+ # Truncate content to 300 chars for prompt
493
+ content_short = doc.content[:300] + "..." if len(doc.content) > 300 else doc.content
494
+ parts.append(f"Nội dung: {content_short}")
495
+ return " | ".join(parts) if parts else str(doc)
496
+
497
+ return str(doc)
498
+
499
+ def _generate_openai(self, prompt: str) -> Optional[str]:
500
+ """Generate answer using OpenAI."""
501
+ if not self.client:
502
+ return None
503
+
504
+ try:
505
+ response = self.client.chat.completions.create(
506
+ model=os.environ.get("OPENAI_MODEL", "gpt-3.5-turbo"),
507
+ messages=[
508
+ {"role": "system", "content": "Bạn là chatbot tư vấn chuyên nghiệp."},
509
+ {"role": "user", "content": prompt}
510
+ ],
511
+ temperature=0.7,
512
+ max_tokens=500
513
+ )
514
+ return response.choices[0].message.content
515
+ except Exception as e:
516
+ print(f"OpenAI API error: {e}")
517
+ return None
518
+
519
+ def _generate_anthropic(self, prompt: str) -> Optional[str]:
520
+ """Generate answer using Anthropic Claude."""
521
+ if not self.client:
522
+ return None
523
+
524
+ try:
525
+ message = self.client.messages.create(
526
+ model=os.environ.get("ANTHROPIC_MODEL", "claude-3-5-sonnet-20241022"),
527
+ max_tokens=500,
528
+ messages=[
529
+ {"role": "user", "content": prompt}
530
+ ]
531
+ )
532
+ return message.content[0].text
533
+ except Exception as e:
534
+ print(f"Anthropic API error: {e}")
535
+ return None
536
+
537
+ def _generate_ollama(self, prompt: str) -> Optional[str]:
538
+ """Generate answer using Ollama (local LLM)."""
539
+ try:
540
+ import requests
541
+ model = getattr(self, 'ollama_model', os.environ.get("OLLAMA_MODEL", "qwen2.5:7b"))
542
+
543
+ response = requests.post(
544
+ f"{self.ollama_base_url}/api/generate",
545
+ json={
546
+ "model": model,
547
+ "prompt": prompt,
548
+ "stream": False,
549
+ "options": {
550
+ "temperature": 0.7,
551
+ "top_p": 0.9,
552
+ "num_predict": 500
553
+ }
554
+ },
555
+ timeout=60
556
+ )
557
+
558
+ if response.status_code == 200:
559
+ return response.json().get("response")
560
+ return None
561
+ except Exception as e:
562
+ print(f"Ollama API error: {e}")
563
+ return None
564
+
565
+ def _generate_huggingface(self, prompt: str) -> Optional[str]:
566
+ """Generate answer using Hugging Face Inference API."""
567
+ try:
568
+ import requests
569
+
570
+ api_url = f"https://api-inference.huggingface.co/models/{self.hf_model}"
571
+ headers = {}
572
+ if hasattr(self, 'hf_api_key') and self.hf_api_key:
573
+ headers["Authorization"] = f"Bearer {self.hf_api_key}"
574
+
575
+ response = requests.post(
576
+ api_url,
577
+ headers=headers,
578
+ json={
579
+ "inputs": prompt,
580
+ "parameters": {
581
+ "temperature": 0.7,
582
+ "max_new_tokens": 500,
583
+ "return_full_text": False
584
+ }
585
+ },
586
+ timeout=60
587
+ )
588
+
589
+ if response.status_code == 200:
590
+ result = response.json()
591
+ if isinstance(result, list) and len(result) > 0:
592
+ return result[0].get("generated_text", "")
593
+ elif isinstance(result, dict):
594
+ return result.get("generated_text", "")
595
+ elif response.status_code == 503:
596
+ # Model is loading, wait and retry
597
+ print("⚠️ Model is loading, please wait...")
598
+ return None
599
+ else:
600
+ print(f"Hugging Face API error: {response.status_code} - {response.text}")
601
+ return None
602
+ except Exception as e:
603
+ print(f"Hugging Face API error: {e}")
604
+ return None
605
+
606
+ def _generate_local(self, prompt: str) -> Optional[str]:
607
+ """Generate answer using local Hugging Face Transformers model."""
608
+ if self.local_model is None or self.local_tokenizer is None:
609
+ return None
610
+
611
+ try:
612
+ import torch
613
+
614
+ # Format prompt for Qwen models
615
+ messages = [
616
+ {"role": "system", "content": "Bạn là chatbot tư vấn chuyên nghiệp."},
617
+ {"role": "user", "content": prompt}
618
+ ]
619
+
620
+ # Apply chat template if available
621
+ if hasattr(self.local_tokenizer, "apply_chat_template"):
622
+ text = self.local_tokenizer.apply_chat_template(
623
+ messages,
624
+ tokenize=False,
625
+ add_generation_prompt=True
626
+ )
627
+ else:
628
+ text = prompt
629
+
630
+ # Tokenize
631
+ inputs = self.local_tokenizer(text, return_tensors="pt")
632
+
633
+ # Move to device
634
+ device = next(self.local_model.parameters()).device
635
+ inputs = {k: v.to(device) for k, v in inputs.items()}
636
+
637
+ # Generate
638
+ with torch.no_grad():
639
+ outputs = self.local_model.generate(
640
+ **inputs,
641
+ max_new_tokens=500,
642
+ temperature=0.7,
643
+ top_p=0.9,
644
+ do_sample=True,
645
+ pad_token_id=self.local_tokenizer.eos_token_id
646
+ )
647
+
648
+ # Decode
649
+ generated_text = self.local_tokenizer.decode(
650
+ outputs[0][inputs["input_ids"].shape[1]:],
651
+ skip_special_tokens=True
652
+ )
653
+
654
+ return generated_text.strip()
655
+
656
+ except TypeError as e:
657
+ # Check for Int8Params compatibility error
658
+ if "_is_hf_initialized" in str(e) or "Int8Params" in str(e):
659
+ error_msg = (
660
+ f"[LLM] ❌ Int8Params compatibility error: {e}\n"
661
+ f"[LLM] 💡 This error occurs when using 8-bit quantization with incompatible library versions.\n"
662
+ f"[LLM] 💡 Solutions:\n"
663
+ f"[LLM] 1. Set LOCAL_MODEL_QUANTIZATION=4bit to use 4-bit quantization instead\n"
664
+ f"[LLM] 2. Set LOCAL_MODEL_QUANTIZATION=none to disable quantization\n"
665
+ f"[LLM] 3. Use API mode (LLM_PROVIDER=api) to avoid local model issues\n"
666
+ f"[LLM] 4. Use a smaller model like Qwen/Qwen2.5-1.5B-Instruct"
667
+ )
668
+ print(error_msg, flush=True)
669
+ logger.error(f"[LLM] ❌ Int8Params compatibility error: {e}")
670
+ print(f"[LLM] ❌ ERROR: {type(e).__name__}: {str(e)}", file=sys.stderr, flush=True)
671
+ return None
672
+ else:
673
+ # Other TypeError, re-raise to be caught by general handler
674
+ raise
675
+ except Exception as e:
676
+ error_trace = traceback.format_exc()
677
+ print(f"[LLM] ❌ Local model generation error: {e}", flush=True)
678
+ print(f"[LLM] ❌ Full trace: {error_trace}", flush=True)
679
+ logger.error(f"[LLM] ❌ Local model generation error: {e}\n{error_trace}")
680
+ print(f"[LLM] ❌ ERROR: {type(e).__name__}: {str(e)}", file=sys.stderr, flush=True)
681
+ traceback.print_exc(file=sys.stderr)
682
+ return None
683
+
684
+ def _generate_api(self, prompt: str, context: Optional[List[Dict[str, Any]]] = None) -> Optional[str]:
685
+ """Generate answer by calling HF Spaces API.
686
+
687
+ Args:
688
+ prompt: Full prompt including query and documents context.
689
+ context: Optional conversation context (not used in API mode, handled by HF Spaces).
690
+ """
691
+ if not self.api_base_url:
692
+ return None
693
+
694
+ try:
695
+ import requests
696
+
697
+ # Prepare request payload
698
+ # Send the full prompt (with documents) as the message to HF Spaces
699
+ # This ensures HF Spaces receives all context from retrieved documents
700
+ payload = {
701
+ "message": prompt,
702
+ "reset_session": False
703
+ }
704
+
705
+ # Only add session_id if we have a valid session context
706
+ # For now, we'll omit it and let the API generate a new one
707
+
708
+ # Add context if available (API may support this in future)
709
+ # For now, context is handled by the API internally
710
+
711
+ # Call API endpoint
712
+ api_url = f"{self.api_base_url}/chatbot/chat/"
713
+ print(f"[LLM] 🔗 Calling API: {api_url}", flush=True)
714
+ print(f"[LLM] 📤 Payload: {payload}", flush=True)
715
+
716
+ response = requests.post(
717
+ api_url,
718
+ json=payload,
719
+ headers={"Content-Type": "application/json"},
720
+ timeout=60
721
+ )
722
+
723
+ print(f"[LLM] 📥 Response status: {response.status_code}", flush=True)
724
+ print(f"[LLM] 📥 Response headers: {dict(response.headers)}", flush=True)
725
+
726
+ if response.status_code == 200:
727
+ try:
728
+ result = response.json()
729
+ print(f"[LLM] 📥 Response JSON: {result}", flush=True)
730
+ # Extract message from response
731
+ if isinstance(result, dict):
732
+ message = result.get("message", None)
733
+ if message:
734
+ print(f"[LLM] ✅ Got message from API (length: {len(message)})", flush=True)
735
+ return message
736
+ else:
737
+ print(f"[LLM] ⚠️ Response is not a dict: {type(result)}", flush=True)
738
+ return None
739
+ except ValueError as e:
740
+ print(f"[LLM] ❌ JSON decode error: {e}", flush=True)
741
+ print(f"[LLM] ❌ Response text: {response.text[:500]}", flush=True)
742
+ return None
743
+ elif response.status_code == 503:
744
+ # Service unavailable - model might be loading
745
+ print("[LLM] ⚠️ API service is loading, please wait...", flush=True)
746
+ return None
747
+ else:
748
+ print(f"[LLM] ❌ API error: {response.status_code} - {response.text[:500]}", flush=True)
749
+ return None
750
+ except requests.exceptions.Timeout:
751
+ print("[LLM] ❌ API request timeout")
752
+ return None
753
+ except requests.exceptions.ConnectionError as e:
754
+ print(f"[LLM] ❌ API connection error: {e}")
755
+ return None
756
+ except Exception as e:
757
+ error_trace = traceback.format_exc()
758
+ print(f"[LLM] ❌ API mode error: {e}", flush=True)
759
+ print(f"[LLM] ❌ Full trace: {error_trace}", flush=True)
760
+ logger.error(f"[LLM] ❌ API mode error: {e}\n{error_trace}")
761
+ return None
762
+
763
+ def summarize_context(self, messages: List[Dict[str, Any]], max_length: int = 200) -> str:
764
+ """
765
+ Summarize conversation context.
766
+
767
+ Args:
768
+ messages: List of conversation messages.
769
+ max_length: Maximum summary length.
770
+
771
+ Returns:
772
+ Summary string.
773
+ """
774
+ if not messages:
775
+ return ""
776
+
777
+ # Simple summarization: extract key entities and intents
778
+ intents = []
779
+ entities = set()
780
+
781
+ for msg in messages:
782
+ if msg.get("intent"):
783
+ intents.append(msg["intent"])
784
+ if msg.get("entities"):
785
+ for key, value in msg["entities"].items():
786
+ if isinstance(value, str):
787
+ entities.add(value)
788
+ elif isinstance(value, list):
789
+ entities.update(value)
790
+
791
+ summary_parts = []
792
+ if intents:
793
+ unique_intents = list(set(intents))
794
+ summary_parts.append(f"Chủ đề: {', '.join(unique_intents)}")
795
+ if entities:
796
+ summary_parts.append(f"Thông tin: {', '.join(list(entities)[:5])}")
797
+
798
+ summary = ". ".join(summary_parts)
799
+ return summary[:max_length] if len(summary) > max_length else summary
800
+
801
+ def extract_entities_llm(self, query: str) -> Dict[str, Any]:
802
+ """
803
+ Extract entities using LLM.
804
+
805
+ Args:
806
+ query: User query.
807
+
808
+ Returns:
809
+ Dictionary of extracted entities.
810
+ """
811
+ if not self.is_available():
812
+ return {}
813
+
814
+ prompt = f"""
815
+ Trích xuất các thực thể từ câu hỏi sau:
816
+ "{query}"
817
+
818
+ Các loại thực thể cần tìm:
819
+ - fine_code: Mã vi phạm (V001, V002, ...)
820
+ - fine_name: Tên vi phạm
821
+ - procedure_name: Tên thủ tục
822
+ - office_name: Tên đơn vị
823
+
824
+ Trả lời dưới dạng JSON: {{"fine_code": "...", "fine_name": "...", ...}}
825
+ Nếu không có, trả về {{}}.
826
+ """
827
+
828
+ try:
829
+ if self.provider == LLM_PROVIDER_OPENAI:
830
+ response = self._generate_openai(prompt)
831
+ elif self.provider == LLM_PROVIDER_ANTHROPIC:
832
+ response = self._generate_anthropic(prompt)
833
+ elif self.provider == LLM_PROVIDER_OLLAMA:
834
+ response = self._generate_ollama(prompt)
835
+ elif self.provider == LLM_PROVIDER_HUGGINGFACE:
836
+ response = self._generate_huggingface(prompt)
837
+ elif self.provider == LLM_PROVIDER_LOCAL:
838
+ response = self._generate_local(prompt)
839
+ elif self.provider == LLM_PROVIDER_API:
840
+ # For API mode, we can't extract entities directly
841
+ # Return empty dict
842
+ return {}
843
+ else:
844
+ return {}
845
+
846
+ if response:
847
+ # Try to extract JSON from response
848
+ json_match = re.search(r'\{[^}]+\}', response)
849
+ if json_match:
850
+ return json.loads(json_match.group())
851
+ except Exception as e:
852
+ print(f"Error extracting entities with LLM: {e}")
853
+
854
+ return {}
855
+
856
+
857
+ # Global LLM generator instance
858
+ _llm_generator: Optional[LLMGenerator] = None
859
+ _last_provider: Optional[str] = None
860
+
861
+ def get_llm_generator() -> Optional[LLMGenerator]:
862
+ """Get or create LLM generator instance.
863
+
864
+ Recreates instance if provider changed (e.g., from local to api).
865
+ """
866
+ global _llm_generator, _last_provider
867
+
868
+ # Get current provider from env
869
+ current_provider = os.environ.get("LLM_PROVIDER", LLM_PROVIDER_NONE).lower()
870
+
871
+ # Recreate if provider changed or instance doesn't exist
872
+ if _llm_generator is None or _last_provider != current_provider:
873
+ _llm_generator = LLMGenerator()
874
+ _last_provider = current_provider
875
+ print(f"[LLM] 🔄 Recreated LLM generator with provider: {current_provider}", flush=True)
876
+
877
+ return _llm_generator if _llm_generator.is_available() else None
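The backed-up variant of get_llm_generator() above re-reads LLM_PROVIDER on every call and rebuilds the generator when it changes. A small sketch of that behaviour, assuming this variant is the one installed as llm_integration.py and that mutating os.environ at runtime is acceptable in the deployment:

# Sketch: switching providers at runtime (values are examples only).
import os
from hue_portal.chatbot.llm_integration import get_llm_generator

os.environ["LLM_PROVIDER"] = "api"   # e.g. move from a local model to HF Spaces API mode
generator = get_llm_generator()      # the helper detects the change and recreates the instance
if generator:
    print(generator.provider)        # expected: "api"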
backend/hue_portal/chatbot/query_expansion.py ADDED
@@ -0,0 +1,228 @@
1
+ """
2
+ Query expansion and paraphrasing utilities for improving search recall.
3
+ """
4
+ import re
5
+ import unicodedata
6
+ from typing import List, Dict, Any, Optional, Set
7
+ from hue_portal.core.models import Synonym
8
+ from hue_portal.core.search_ml import expand_query_with_synonyms
9
+
10
+
11
+ def normalize_vietnamese_query(query: str) -> str:
12
+ """
13
+ Normalize Vietnamese text by handling diacritics variants.
14
+
15
+ Args:
16
+ query: Input query string.
17
+
18
+ Returns:
19
+ Normalized query string.
20
+ """
21
+ if not query:
22
+ return ""
23
+
24
+ # Remove extra spaces
25
+ query = re.sub(r'\s+', ' ', query.strip())
26
+
27
+ # Lowercase
28
+ query = query.lower()
29
+
30
+ return query
31
+
32
+
33
+ def extract_key_phrases(query: str) -> List[str]:
34
+ """
35
+ Extract key phrases from query.
36
+
37
+ Args:
38
+ query: Input query string.
39
+
40
+ Returns:
41
+ List of key phrases.
42
+ """
43
+ if not query:
44
+ return []
45
+
46
+ # Remove common stopwords
47
+ stopwords = {
48
+ "là", "gì", "bao nhiêu", "như thế nào", "ở đâu", "của", "và", "hoặc",
49
+ "tôi", "bạn", "có", "không", "được", "một", "các", "với", "cho"
50
+ }
51
+
52
+ # Split into words
53
+ words = re.findall(r'\b\w+\b', query.lower())
54
+
55
+ # Filter stopwords and short words
56
+ key_words = [w for w in words if w not in stopwords and len(w) > 2]
57
+
58
+ # Extract bigrams (2-word phrases)
59
+ phrases = []
60
+ for i in range(len(key_words) - 1):
61
+ phrase = f"{key_words[i]} {key_words[i+1]}"
62
+ phrases.append(phrase)
63
+
64
+ # Combine single words and phrases
65
+ all_phrases = key_words + phrases
66
+
67
+ return all_phrases
68
+
69
+
70
+ def expand_query_semantically(query: str, context: Optional[Dict[str, Any]] = None) -> List[str]:
71
+ """
72
+ Expand query with synonyms and related terms.
73
+
74
+ Args:
75
+ query: Original query string.
76
+ context: Optional context dictionary with entities, intents, etc.
77
+
78
+ Returns:
79
+ List of expanded query variations.
80
+ """
81
+ expanded = [query]
82
+
83
+ # Use existing synonym expansion
84
+ synonym_expanded = expand_query_with_synonyms(query)
85
+ expanded.extend(synonym_expanded)
86
+
87
+ # Add context-based expansions
88
+ if context:
89
+ entities = context.get("entities", {})
90
+
91
+ # If fine_code in context, add fine name variations
92
+ if "fine_code" in entities:
93
+ fine_code = entities["fine_code"]
94
+ # Could look up fine name from database and add variations
95
+ expanded.append(f"{query} {fine_code}")
96
+
97
+ # If procedure_name in context, add procedure variations
98
+ if "procedure_name" in entities:
99
+ procedure_name = entities["procedure_name"]
100
+ expanded.append(f"{query} {procedure_name}")
101
+
102
+ # Add common Vietnamese variations
103
+ variations = _get_vietnamese_variations(query)
104
+ expanded.extend(variations)
105
+
106
+ # Remove duplicates while preserving order
107
+ seen = set()
108
+ unique_expanded = []
109
+ for q in expanded:
110
+ q_normalized = normalize_vietnamese_query(q)
111
+ if q_normalized not in seen:
112
+ seen.add(q_normalized)
113
+ unique_expanded.append(q)
114
+
115
+ return unique_expanded
116
+
117
+
118
+ def _get_vietnamese_variations(query: str) -> List[str]:
119
+ """
120
+ Get common Vietnamese query variations.
121
+
122
+ Args:
123
+ query: Input query.
124
+
125
+ Returns:
126
+ List of variations.
127
+ """
128
+ variations = []
129
+ query_lower = query.lower()
130
+
131
+ # Common synonym mappings
132
+ synonym_map = {
133
+ "mức phạt": ["tiền phạt", "phạt", "xử phạt"],
134
+ "thủ tục": ["hồ sơ", "giấy tờ", "quy trình"],
135
+ "địa chỉ": ["nơi", "chỗ", "điểm"],
136
+ "số điện thoại": ["điện thoại", "số liên hệ", "hotline"],
137
+ "giờ làm việc": ["thời gian", "giờ", "lịch làm việc"],
138
+ "cảnh báo": ["thông báo", "lưu ý", "chú ý"],
139
+ "lừa đảo": ["scam", "gian lận", "lừa"],
140
+ }
141
+
142
+ for key, synonyms in synonym_map.items():
143
+ if key in query_lower:
144
+ for synonym in synonyms:
145
+ variation = query_lower.replace(key, synonym)
146
+ if variation != query_lower:
147
+ variations.append(variation)
148
+
149
+ return variations
150
+
151
+
152
+ def paraphrase_query(query: str) -> List[str]:
153
+ """
154
+ Generate paraphrases of the query to increase recall.
155
+
156
+ Args:
157
+ query: Original query string.
158
+
159
+ Returns:
160
+ List of paraphrased queries.
161
+ """
162
+ paraphrases = [query]
163
+ query_lower = query.lower()
164
+
165
+ # Common paraphrasing patterns for Vietnamese
166
+ patterns = [
167
+ # Question variations
168
+ (r"mức phạt (.+) là bao nhiêu", r"phạt \1 bao nhiêu tiền"),
169
+ (r"thủ tục (.+) cần gì", r"làm thủ tục \1 cần giấy tờ gì"),
170
+ (r"địa chỉ (.+) ở đâu", r"\1 ở đâu"),
171
+ (r"(.+) như thế nào", r"cách \1"),
172
+ ]
173
+
174
+ for pattern, replacement in patterns:
175
+ if re.search(pattern, query_lower):
176
+ paraphrase = re.sub(pattern, replacement, query_lower)
177
+ if paraphrase != query_lower:
178
+ paraphrases.append(paraphrase)
179
+
180
+ # Add question word variations
181
+ if "bao nhiêu" in query_lower:
182
+ paraphrases.append(query_lower.replace("bao nhiêu", "mức"))
183
+ paraphrases.append(query_lower.replace("bao nhiêu", "giá"))
184
+
185
+ if "như thế nào" in query_lower:
186
+ paraphrases.append(query_lower.replace("như thế nào", "cách"))
187
+ paraphrases.append(query_lower.replace("như thế nào", "quy trình"))
188
+
189
+ # Remove duplicates
190
+ return list(dict.fromkeys(paraphrases))
191
+
192
+
193
+ def enhance_query_with_context(query: str, context: Optional[Dict[str, Any]] = None) -> str:
194
+ """
195
+ Enhance query with context information.
196
+
197
+ Args:
198
+ query: Original query string.
199
+ context: Optional context dictionary.
200
+
201
+ Returns:
202
+ Enhanced query string.
203
+ """
204
+ if not context:
205
+ return query
206
+
207
+ enhanced_parts = [query]
208
+
209
+ # Add entities from context
210
+ entities = context.get("entities", {})
211
+ if "fine_code" in entities:
212
+ enhanced_parts.append(entities["fine_code"])
213
+ if "procedure_name" in entities:
214
+ enhanced_parts.append(entities["procedure_name"])
215
+ if "office_name" in entities:
216
+ enhanced_parts.append(entities["office_name"])
217
+
218
+ # Add intent-based keywords
219
+ intent = context.get("intent", "")
220
+ if intent == "search_fine":
221
+ enhanced_parts.append("mức phạt vi phạm")
222
+ elif intent == "search_procedure":
223
+ enhanced_parts.append("thủ tục hành chính")
224
+ elif intent == "search_office":
225
+ enhanced_parts.append("đơn vị công an")
226
+
227
+ return " ".join(enhanced_parts)
228
+
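A usage sketch for the expansion helpers above, assuming Django is configured so the synonym lookups can hit the database; the query and context values are placeholders:

from hue_portal.chatbot.query_expansion import (
    expand_query_semantically,
    paraphrase_query,
    enhance_query_with_context,
)

query = "mức phạt vượt đèn đỏ là bao nhiêu"
variants = expand_query_semantically(query)      # original + synonym + Vietnamese variations
variants += paraphrase_query(query)              # rule-based paraphrases ("bao nhiêu" -> "mức", ...)
enhanced = enhance_query_with_context(query, {"intent": "search_fine", "entities": {}})
print(len(set(variants)), enhanced)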
backend/hue_portal/chatbot/router.py ADDED
@@ -0,0 +1,165 @@
1
+ """
2
+ Routing utilities that decide whether a query should hit RAG or stay in small-talk.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import re
7
+ from dataclasses import dataclass, field
8
+ from enum import Enum
9
+ from typing import Dict, Optional
10
+
11
+
12
+ class IntentRoute(str, Enum):
13
+ """High-level route for the chatbot pipeline."""
14
+
15
+ GREETING = "greeting"
16
+ SMALL_TALK = "small_talk"
17
+ SEARCH = "search"
18
+
19
+
20
+ DOCUMENT_CODE_PATTERNS = [
21
+ r"264[-\s]?QD[-\s]?TW",
22
+ r"QD[-\s]?69[-\s]?TW",
23
+ r"TT[-\s]?02[-\s]?CAND",
24
+ r"TT[-\s]?02[-\s]?BIEN[-\s]?SOAN",
25
+ r"QUYET[-\s]?DINH[-\s]?69",
26
+ r"QUYET[-\s]?DINH[-\s]?264",
27
+ r"THONG[-\s]?TU[-\s]?02",
28
+ ]
29
+
30
+ SMALL_TALK_PHRASES = [
31
+ "mệt quá",
32
+ "nhàm chán",
33
+ "tâm sự",
34
+ "chém gió",
35
+ "đang làm gì",
36
+ "chuyện trò",
37
+ "trò chuyện",
38
+ "hỏi chơi thôi",
39
+ ]
40
+
41
+
42
+ def _has_document_code(query: str) -> bool:
43
+ normalized = query.upper()
44
+ return any(re.search(pattern, normalized) for pattern in DOCUMENT_CODE_PATTERNS)
45
+
46
+
47
+ def _flag_keywords(query_lower: str) -> Dict[str, bool]:
48
+ return {
49
+ "greeting": any(
50
+ phrase in query_lower for phrase in ["xin chào", "xin chao", "chào", "chao", "hello", "hi"]
51
+ ),
52
+ "fine": any(
53
+ kw in query_lower
54
+ for kw in ["mức phạt", "phạt", "vi phạm", "đèn đỏ", "nồng độ cồn", "mũ bảo hiểm", "tốc độ"]
55
+ ),
56
+ "procedure": any(
57
+ kw in query_lower for kw in ["thủ tục", "thu tuc", "hồ sơ", "ho so", "điều kiện", "dieu kien", "cư trú", "cu tru"]
58
+ ),
59
+ "advisory": any(kw in query_lower for kw in ["cảnh báo", "lua dao", "lừa đảo", "scam", "mạo danh", "thủ đoạn"]),
60
+ "office": any(kw in query_lower for kw in ["địa chỉ", "dia chi", "công an", "cong an", "điểm tiếp dân", "số điện thoại"]),
61
+ "legal": any(
62
+ kw in query_lower
63
+ for kw in [
64
+ "quyết định",
65
+ "quyet dinh",
66
+ "thông tư",
67
+ "thong tu",
68
+ "nghị quyết",
69
+ "nghi quyet",
70
+ "nghị định",
71
+ "nghi dinh",
72
+ "luật",
73
+ "luat",
74
+ "điều ",
75
+ "dieu ",
76
+ "kỷ luật",
77
+ "qd 69",
78
+ "qd 264",
79
+ "thông tư 02",
80
+ "điều lệnh",
81
+ "văn bản pháp luật",
82
+ ]
83
+ ),
84
+ "small_talk": any(phrase in query_lower for phrase in SMALL_TALK_PHRASES),
85
+ }
86
+
87
+
88
+ @dataclass
89
+ class RouteDecision:
90
+ route: IntentRoute
91
+ intent: str
92
+ confidence: float
93
+ rationale: str
94
+ forced_intent: Optional[str] = None
95
+ keyword_flags: Dict[str, bool] = field(default_factory=dict)
96
+
97
+
98
+ def decide_route(query: str, intent: str, confidence: float) -> RouteDecision:
99
+ """
100
+ Decide how the chatbot should handle the query before invoking RAG.
101
+ """
102
+ query_lower = query.lower().strip()
103
+ words = query_lower.split()
104
+ keyword_flags = _flag_keywords(query_lower)
105
+ has_doc_code = _has_document_code(query_lower)
106
+
107
+ route = IntentRoute.SEARCH
108
+ rationale = "default-search"
109
+ forced_intent: Optional[str] = None
110
+
111
+ doc_code_override = False
112
+ if has_doc_code and intent != "search_legal":
113
+ forced_intent = "search_legal"
114
+ rationale = "doc-code-detected"
115
+ route = IntentRoute.SEARCH
116
+ doc_code_override = True
117
+
118
+ greeting_candidate = (
119
+ len(words) <= 3 and keyword_flags["greeting"] and not any(
120
+ keyword_flags[key] for key in ["fine", "procedure", "advisory", "office", "legal"]
121
+ )
122
+ )
123
+ if greeting_candidate and intent == "greeting" and not doc_code_override:
124
+ route = IntentRoute.GREETING
125
+ rationale = "simple-greeting"
126
+ forced_intent = "greeting"
127
+ elif (
128
+ not doc_code_override
129
+ and keyword_flags["small_talk"]
130
+ and not any(keyword_flags[key] for key in ["fine", "procedure", "advisory", "office", "legal"])
131
+ ):
132
+ route = IntentRoute.SMALL_TALK
133
+ rationale = "small-talk-keywords"
134
+ forced_intent = "general_query"
135
+ elif not doc_code_override and (intent == "general_query" or confidence < 0.55):
136
+ # Generic small talk / low confidence
137
+ route = IntentRoute.SMALL_TALK
138
+ rationale = "general-or-low-confidence"
139
+
140
+ if route != IntentRoute.GREETING and not doc_code_override:
141
+ keyword_force_map = [
142
+ ("legal", "search_legal"),
143
+ ("fine", "search_fine"),
144
+ ("procedure", "search_procedure"),
145
+ ("advisory", "search_advisory"),
146
+ ("office", "search_office"),
147
+ ]
148
+ for flag, target_intent in keyword_force_map:
149
+ if forced_intent:
150
+ break
151
+ if keyword_flags.get(flag) and intent != target_intent:
152
+ forced_intent = target_intent
153
+ route = IntentRoute.SEARCH
154
+ rationale = f"keyword-override-{flag}"
155
+ break
156
+
157
+ return RouteDecision(
158
+ route=route,
159
+ intent=intent,
160
+ confidence=confidence,
161
+ rationale=rationale,
162
+ forced_intent=forced_intent,
163
+ keyword_flags=keyword_flags,
164
+ )
165
+
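A quick sketch of decide_route() on a low-confidence classifier result; the fine-related keywords trigger the override noted in the comments:

from hue_portal.chatbot.router import decide_route, IntentRoute

decision = decide_route(
    "mức phạt vượt đèn đỏ là bao nhiêu?",
    intent="general_query",
    confidence=0.42,
)
print(decision.route)          # IntentRoute.SEARCH (fine keywords override small talk)
print(decision.forced_intent)  # "search_fine"
print(decision.rationale)      # "keyword-override-fine"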
backend/hue_portal/chatbot/schemas/legal_answer.rail ADDED
@@ -0,0 +1,63 @@
1
+ <rail version="0.2">
2
+ <output>
3
+ <object name="LegalAnswer">
4
+ <string name="summary" format="no_apology vietnamese_legal_summary" />
5
+ <list name="details" min_length="2">
6
+ <string format="vietnamese_bullet_with_citation" />
7
+ </list>
8
+ <list name="citations" min_length="1">
9
+ <object>
10
+ <string name="document_title" />
11
+ <string name="section_code" />
12
+ <string name="page_range" required="false" />
13
+ <string name="summary" format="short_summary" />
14
+ <string name="snippet" />
15
+ </object>
16
+ </list>
17
+ </object>
18
+ </output>
19
+
20
+ <prompt>
21
+ Bạn là chuyên gia tư vấn về xử lí kỷ luật cán bộ đảng viên của Phòng Thanh Tra - Công An Thành Phố Huế. Tổng hợp câu trả lời dựa trên các trích đoạn đã cung cấp.
22
+
23
+ Yêu cầu bắt buộc:
24
+ - Tất cả nội dung phải bằng tiếng Việt trang trọng, không xin lỗi hay né tránh.
25
+ - Phần summary phải nhắc rõ tên văn bản chính (ví dụ: Quyết định 69/QĐ-TW) và nêu kết luận 1-2 câu.
26
+ - Mỗi phần tử trong DETAILS là một bullet mô tả hình thức xử lý hoặc điều khoản, phải ghi rõ Điều/Khoản hoặc chương tương ứng.
27
+ - DETAILS phải ghi đúng tên văn bản có trong dữ liệu (ví dụ: Quyết định 69/QĐ-TW, Thông tư 02/CAND) và không bịa ra điều khoản khác.
28
+ - CITATIONS phải chứa ít nhất một mục, mỗi mục nêu rõ văn bản, điều khoản, trang và trích đoạn ≤500 ký tự.
29
+ - Nếu thiếu thông tin, ghi rõ trong summary nhưng vẫn tuân thủ định dạng.
30
+
31
+ $context
32
+ </prompt>
33
+
34
+ <output_format>
35
+ {{output}}
36
+ </output_format>
37
+
38
+ <instructions>
39
+ <list name="no_apology">
40
+ <string>Không chứa cụm xin lỗi (ví dụ: “xin lỗi”, “rất tiếc”).</string>
41
+ <string>Bắt buộc nhắc tên văn bản pháp luật.</string>
42
+ </list>
43
+
44
+ <list name="vietnamese_legal_summary">
45
+ <string>Viết tiếng Việt trang trọng, tối đa 2 câu.</string>
46
+ <string>Nhắc tên văn bản áp dụng.</string>
47
+ </list>
48
+
49
+ <list name="vietnamese_bullet_with_citation">
50
+ <string>Mỗi bullet bắt đầu bằng dấu “- ”.</string>
51
+ <string>Có cụm “Điều” hoặc “Khoản”.</string>
52
+ <string>Phải chứa tên văn bản pháp luật (ví dụ: “Quyết định 69/QĐ-TW”).</string>
53
+ <string>Chỉ sử dụng điều/khoản xuất hiện trong dữ liệu; nếu không rõ ghi “(không nêu điều cụ thể)”.</string>
54
+ <string>Không dùng tiếng Anh hoặc tiếng Trung.</string>
55
+ <string>Không phát minh hình thức kỷ luật hoặc điều luật mới.</string>
56
+ </list>
57
+
58
+ <list name="short_summary">
59
+ <string>Tối đa 2 câu.</string>
60
+ </list>
61
+ </instructions>
62
+ </rail>
63
+
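One way this .rail spec could be loaded is via the guardrails-ai package; the sketch below assumes the older Guard.from_rail API that matched rail v0.2 files (the library's interface has changed across releases), and llm_call is only a placeholder for the project's actual LLM callable:

import guardrails as gd

# Assumption: Guard.from_rail exists in the installed guardrails-ai version.
guard = gd.Guard.from_rail("backend/hue_portal/chatbot/schemas/legal_answer.rail")

def llm_call(prompt: str, **kwargs) -> str:
    # Placeholder: delegate to LLMGenerator.generate_answer() or another provider.
    raise NotImplementedError

# The validated output would follow the LegalAnswer schema above
# (summary, details, citations); the exact call signature depends on the library version:
# raw_output, validated_output, *_ = guard(llm_call, prompt_params={"context": "..."})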
backend/hue_portal/chatbot/slow_path_handler.py ADDED
@@ -0,0 +1,1392 @@
1
+ """
2
+ Slow Path Handler - Full RAG pipeline for complex queries.
3
+ """
4
+ import os
5
+ import time
6
+ import logging
7
+ import hashlib
8
+ from typing import Dict, Any, Optional, List, Set
9
+ import unicodedata
10
+ import re
11
+ from concurrent.futures import ThreadPoolExecutor, Future
12
+ import threading
13
+
14
+ from hue_portal.core.chatbot import get_chatbot, RESPONSE_TEMPLATES
15
+ from hue_portal.core.models import (
16
+ Fine,
17
+ Procedure,
18
+ Office,
19
+ Advisory,
20
+ LegalSection,
21
+ LegalDocument,
22
+ )
23
+ from hue_portal.core.search_ml import search_with_ml
24
+ from hue_portal.core.pure_semantic_search import pure_semantic_search
25
+ # Lazy import reranker to avoid blocking startup (FlagEmbedding may download model)
26
+ # from hue_portal.core.reranker import rerank_documents
27
+ from hue_portal.chatbot.llm_integration import get_llm_generator
28
+ from hue_portal.chatbot.structured_legal import format_structured_legal_answer
29
+ from hue_portal.chatbot.context_manager import ConversationContext
30
+ from hue_portal.chatbot.router import DOCUMENT_CODE_PATTERNS
31
+ from hue_portal.core.query_rewriter import get_query_rewriter
32
+ from hue_portal.core.pure_semantic_search import pure_semantic_search, parallel_vector_search
33
+ from hue_portal.core.redis_cache import get_redis_cache
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class SlowPathHandler:
39
+ """Handle Slow Path queries with full RAG pipeline."""
40
+
41
+ def __init__(self):
42
+ self.chatbot = get_chatbot()
43
+ self.llm_generator = get_llm_generator()
44
+ # Thread pool for parallel search (max 2 workers to avoid overwhelming DB)
45
+ self._executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="parallel_search")
46
+ # Cache for prefetched results by session_id (in-memory fallback)
47
+ self._prefetched_cache: Dict[str, Dict[str, Any]] = {}
48
+ self._cache_lock = threading.Lock()
49
+ # Redis cache for prefetch results
50
+ self.redis_cache = get_redis_cache()
51
+ # Prefetch cache TTL (30 minutes default)
52
+ self.prefetch_cache_ttl = int(os.environ.get("CACHE_PREFETCH_TTL", "1800"))
53
+ # Toggle wizard flow (disable to answer directly)
54
+ self.disable_wizard_flow = os.environ.get("DISABLE_WIZARD_FLOW", "false").lower() == "true"
55
+
56
+ def handle(
57
+ self,
58
+ query: str,
59
+ intent: str,
60
+ session_id: Optional[str] = None,
61
+ selected_document_code: Optional[str] = None,
62
+ ) -> Dict[str, Any]:
63
+ """
64
+ Full RAG pipeline:
65
+ 1. Search (hybrid: BM25 + vector)
66
+ 2. Retrieve top 20 documents
67
+ 3. LLM generation with structured output (for legal queries)
68
+ 4. Guardrails validation
69
+ 5. Retry up to 3 times if needed
70
+
71
+ Args:
72
+ query: User query.
73
+ intent: Detected intent.
74
+ session_id: Optional session ID for context.
75
+ selected_document_code: Selected document code from wizard.
76
+
77
+ Returns:
78
+ Response dict with message, intent, results, etc.
79
+ """
80
+ query = query.strip()
81
+ selected_document_code_normalized = (
82
+ selected_document_code.strip().upper() if selected_document_code else None
83
+ )
84
+
85
+ # Handle greetings
86
+ if intent == "greeting":
87
+ query_lower = query.lower().strip()
88
+ query_words = query_lower.split()
89
+ is_simple_greeting = (
90
+ len(query_words) <= 3 and
91
+ any(greeting in query_lower for greeting in ["xin chào", "chào", "hello", "hi"]) and
92
+ not any(kw in query_lower for kw in ["phạt", "mức phạt", "vi phạm", "thủ tục", "hồ sơ", "địa chỉ", "công an", "cảnh báo"])
93
+ )
94
+ if is_simple_greeting:
95
+ return {
96
+ "message": RESPONSE_TEMPLATES["greeting"],
97
+ "intent": "greeting",
98
+ "results": [],
99
+ "count": 0,
100
+ "_source": "slow_path"
101
+ }
102
+
103
+ # Wizard / option-first for every general legal question:
104
+ # If:
105
+ # - the intent is search_legal
106
+ # - no selected_document_code is stored in the session
107
+ # - the query does not explicitly name a document code
108
+ # Then: always return an options payload so the user picks a document first,
109
+ # and do not generate the detailed answer yet.
110
+ has_explicit_code = self._has_explicit_document_code_in_query(query)
111
+ logger.info(
112
+ "[WIZARD] Checking wizard conditions - intent=%s, selected_code=%s, has_explicit_code=%s, query='%s'",
113
+ intent,
114
+ selected_document_code_normalized,
115
+ has_explicit_code,
116
+ query[:50],
117
+ )
118
+ if (
119
+ intent == "search_legal"
120
+ and not self.disable_wizard_flow
121
+ and not selected_document_code_normalized
122
+ and not has_explicit_code
123
+ ):
124
+ logger.info("[QUERY_REWRITE] ✅ Wizard conditions met, using Query Rewrite Strategy")
125
+
126
+ # Query Rewrite Strategy: Rewrite query into 3-5 optimized legal queries
127
+ query_rewriter = get_query_rewriter(self.llm_generator)
128
+
129
+ # Get conversation context for query rewriting
130
+ context = None
131
+ if session_id:
132
+ try:
133
+ recent_messages = ConversationContext.get_recent_messages(session_id, limit=5)
134
+ context = [
135
+ {"role": msg.role, "content": msg.content}
136
+ for msg in recent_messages
137
+ ]
138
+ except Exception as exc:
139
+ logger.warning("[QUERY_REWRITE] Failed to load context: %s", exc)
140
+
141
+ # Rewrite query into 3-5 queries
142
+ rewritten_queries = query_rewriter.rewrite_query(
143
+ query,
144
+ context=context,
145
+ max_queries=5,
146
+ min_queries=3
147
+ )
148
+
149
+ if not rewritten_queries:
150
+ # Fallback to original query if rewrite fails
151
+ rewritten_queries = [query]
152
+
153
+ logger.info(
154
+ "[QUERY_REWRITE] Rewrote query into %d queries: %s",
155
+ len(rewritten_queries),
156
+ rewritten_queries[:3]
157
+ )
158
+
159
+ # Parallel vector search with multiple queries
160
+ try:
161
+ from hue_portal.core.models import LegalSection
162
+
163
+ # Search all legal sections (no document filter yet)
164
+ qs = LegalSection.objects.all()
165
+ text_fields = ["section_title", "section_code", "content"]
166
+
167
+ # Use parallel vector search
168
+ search_results = parallel_vector_search(
169
+ rewritten_queries,
170
+ qs,
171
+ top_k_per_query=5,
172
+ final_top_k=7,
173
+ text_fields=text_fields
174
+ )
175
+
176
+ # Extract unique document codes from results
177
+ doc_codes_seen: Set[str] = set()
178
+ document_options: List[Dict[str, Any]] = []
179
+
180
+ for section, score in search_results:
181
+ doc = getattr(section, "document", None)
182
+ if not doc:
183
+ continue
184
+
185
+ doc_code = getattr(doc, "code", "").upper()
186
+ if not doc_code or doc_code in doc_codes_seen:
187
+ continue
188
+
189
+ doc_codes_seen.add(doc_code)
190
+
191
+ # Get document metadata
192
+ doc_title = getattr(doc, "title", "") or doc_code
193
+ doc_summary = getattr(doc, "summary", "") or ""
194
+ if not doc_summary:
195
+ metadata = getattr(doc, "metadata", {}) or {}
196
+ if isinstance(metadata, dict):
197
+ doc_summary = metadata.get("summary", "")
198
+
199
+ document_options.append({
200
+ "code": doc_code,
201
+ "title": doc_title,
202
+ "summary": doc_summary,
203
+ "score": float(score),
204
+ "doc_type": getattr(doc, "doc_type", "") or "",
205
+ })
206
+
207
+ # Limit to top 5 documents
208
+ if len(document_options) >= 5:
209
+ break
210
+
211
+ # If no documents found, use canonical fallback
212
+ if not document_options:
213
+ logger.warning("[QUERY_REWRITE] No documents found, using canonical fallback")
214
+ canonical_candidates = [
215
+ {
216
+ "code": "264-QD-TW",
217
+ "title": "Quyết định 264-QĐ/TW về kỷ luật đảng viên",
218
+ "summary": "",
219
+ "doc_type": "",
220
+ },
221
+ {
222
+ "code": "QD-69-TW",
223
+ "title": "Quy định 69-QĐ/TW về kỷ luật tổ chức đảng, đảng viên",
224
+ "summary": "",
225
+ "doc_type": "",
226
+ },
227
+ {
228
+ "code": "TT-02-CAND",
229
+ "title": "Thông tư 02/2021/TT-BCA về điều lệnh CAND",
230
+ "summary": "",
231
+ "doc_type": "",
232
+ },
233
+ ]
234
+ clarification_payload = self._build_clarification_payload(
235
+ query, canonical_candidates
236
+ )
237
+ if clarification_payload:
238
+ clarification_payload.setdefault("intent", intent)
239
+ clarification_payload.setdefault("_source", "clarification")
240
+ clarification_payload.setdefault("routing", "clarification")
241
+ clarification_payload.setdefault("confidence", 0.3)
242
+ return clarification_payload
243
+
244
+ # Build options from search results
245
+ options = [
246
+ {
247
+ "code": opt["code"],
248
+ "title": opt["title"],
249
+ "reason": opt.get("summary") or f"Độ liên quan: {opt['score']:.2f}",
250
+ }
251
+ for opt in document_options
252
+ ]
253
+
254
+ # Add "Khác" option
255
+ if not any(opt.get("code") == "__other__" for opt in options):
256
+ options.append({
257
+ "code": "__other__",
258
+ "title": "Khác",
259
+ "reason": "Tôi muốn hỏi văn bản hoặc chủ đề pháp luật khác.",
260
+ })
261
+
262
+ message = (
263
+ "Tôi đã tìm thấy các văn bản pháp luật liên quan đến câu hỏi của bạn.\n\n"
264
+ "Bạn hãy chọn văn bản muốn tra cứu để tôi trả lời chi tiết hơn:"
265
+ )
266
+
267
+ logger.info(
268
+ "[QUERY_REWRITE] ✅ Found %d documents using Query Rewrite Strategy",
269
+ len(document_options)
270
+ )
271
+
272
+ return {
273
+ "type": "options",
274
+ "wizard_stage": "choose_document",
275
+ "message": message,
276
+ "options": options,
277
+ "clarification": {
278
+ "message": message,
279
+ "options": options,
280
+ },
281
+ "results": [],
282
+ "count": 0,
283
+ "intent": intent,
284
+ "_source": "query_rewrite",
285
+ "routing": "query_rewrite",
286
+ "confidence": 0.95, # High confidence with Query Rewrite Strategy
287
+ }
288
+
289
+ except Exception as exc:
290
+ logger.error(
291
+ "[QUERY_REWRITE] Error in Query Rewrite Strategy: %s, falling back to LLM suggestions",
292
+ exc,
293
+ exc_info=True
294
+ )
295
+ # Fallback to original LLM-based clarification
296
+ canonical_candidates: List[Dict[str, Any]] = []
297
+ try:
298
+ canonical_docs = list(
299
+ LegalDocument.objects.filter(
300
+ code__in=["264-QD-TW", "QD-69-TW", "TT-02-CAND"]
301
+ )
302
+ )
303
+ for doc in canonical_docs:
304
+ summary = getattr(doc, "summary", "") or ""
305
+ metadata = getattr(doc, "metadata", {}) or {}
306
+ if not summary and isinstance(metadata, dict):
307
+ summary = metadata.get("summary", "")
308
+ canonical_candidates.append(
309
+ {
310
+ "code": doc.code,
311
+ "title": getattr(doc, "title", "") or doc.code,
312
+ "summary": summary,
313
+ "doc_type": getattr(doc, "doc_type", "") or "",
314
+ "section_title": "",
315
+ }
316
+ )
317
+ except Exception as e:
318
+ logger.warning("[CLARIFICATION] Canonical documents lookup failed: %s", e)
319
+
320
+ if not canonical_candidates:
321
+ canonical_candidates = [
322
+ {
323
+ "code": "264-QD-TW",
324
+ "title": "Quyết định 264-QĐ/TW về kỷ luật đảng viên",
325
+ "summary": "",
326
+ "doc_type": "",
327
+ "section_title": "",
328
+ },
329
+ {
330
+ "code": "QD-69-TW",
331
+ "title": "Quy định 69-QĐ/TW về kỷ luật tổ chức đảng, đảng viên",
332
+ "summary": "",
333
+ "doc_type": "",
334
+ "section_title": "",
335
+ },
336
+ {
337
+ "code": "TT-02-CAND",
338
+ "title": "Thông tư 02/2021/TT-BCA về điều lệnh CAND",
339
+ "summary": "",
340
+ "doc_type": "",
341
+ "section_title": "",
342
+ },
343
+ ]
344
+
345
+ clarification_payload = self._build_clarification_payload(
346
+ query, canonical_candidates
347
+ )
348
+ if clarification_payload:
349
+ clarification_payload.setdefault("intent", intent)
350
+ clarification_payload.setdefault("_source", "clarification_fallback")
351
+ clarification_payload.setdefault("routing", "clarification")
352
+ clarification_payload.setdefault("confidence", 0.3)
353
+ return clarification_payload
354
+
355
+ # Search based on intent - retrieve top-15 for reranking (balance speed and RAM)
356
+ search_result = self._search_by_intent(
357
+ intent,
358
+ query,
359
+ limit=15,
360
+ preferred_document_code=selected_document_code_normalized,
361
+ ) # Balance: 15 for good recall, not too slow
362
+
363
+ # Fast path for high-confidence legal queries (skip for complex queries)
364
+ fast_path_response = None
365
+ if intent == "search_legal" and not self._is_complex_query(query):
366
+ fast_path_response = self._maybe_fast_path_response(search_result["results"], query)
367
+ if fast_path_response:
368
+ fast_path_response["intent"] = intent
369
+ fast_path_response["_source"] = "fast_path"
370
+ return fast_path_response
371
+
372
+ # Rerank results - DISABLED for speed (can enable via ENABLE_RERANKER env var)
373
+ # Reranker adds 1-3 seconds delay, skip for faster responses
374
+ enable_reranker = os.environ.get("ENABLE_RERANKER", "false").lower() == "true"
375
+ if intent == "search_legal" and enable_reranker:
376
+ try:
377
+ # Lazy import to avoid blocking startup (FlagEmbedding may download model)
378
+ from hue_portal.core.reranker import rerank_documents
379
+
380
+ legal_results = [r for r in search_result["results"] if r.get("type") == "legal"]
381
+ if len(legal_results) > 0:
382
+ # Rerank to top-4 (balance speed and context quality)
383
+ top_k = min(4, len(legal_results))
384
+ reranked = rerank_documents(query, legal_results, top_k=top_k)
385
+ # Update search_result with reranked results (keep non-legal results)
386
+ non_legal = [r for r in search_result["results"] if r.get("type") != "legal"]
387
+ search_result["results"] = reranked + non_legal
388
+ search_result["count"] = len(search_result["results"])
389
+ logger.info(
390
+ "[RERANKER] Reranked %d legal results to top-%d for query: %s",
391
+ len(legal_results),
392
+ top_k,
393
+ query[:50]
394
+ )
395
+ except Exception as e:
396
+ logger.warning("[RERANKER] Reranking failed: %s, using original results", e)
397
+ elif intent == "search_legal":
398
+ # Skip reranking for speed - just use top results by score
399
+ logger.debug("[RERANKER] Skipped reranking for speed (ENABLE_RERANKER=false)")
400
+
401
+         # STEP 1: Bypass the LLM when we already have good results (avoids context overflow + 30-40% speedup)
402
+         # Only applied to legal queries whose results have a high score
403
+ if intent == "search_legal" and search_result["count"] > 0:
404
+ top_result = search_result["results"][0]
405
+ top_score = top_result.get("score", 0.0) or 0.0
406
+ top_data = top_result.get("data", {})
407
+ doc_code = (top_data.get("document_code") or "").upper()
408
+ content = top_data.get("content", "") or top_data.get("excerpt", "")
409
+
410
+             # Bypass the LLM if:
411
+             # 1. A document code is present (TT-02-CAND, etc.) and the content is long enough
412
+             # 2. Score >= 0.4 (lowered threshold so the bypass triggers more easily)
413
+             # 3. Or important keywords are present (%, rank demotion, emulation rating, ratio) with score >= 0.3
414
+ should_bypass = False
415
+ query_lower = query.lower()
416
+ has_keywords = any(kw in query_lower for kw in ["%", "phần trăm", "tỷ lệ", "12%", "20%", "10%", "hạ bậc", "thi đua", "xếp loại", "vi phạm", "cán bộ"])
417
+
418
+             # Relaxed bypass condition: doc_code present + content long enough + reasonable score
419
+ if doc_code and len(content) > 100:
420
+ if top_score >= 0.4:
421
+ should_bypass = True
422
+ elif has_keywords and top_score >= 0.3:
423
+ should_bypass = True
424
+             # Or important keywords present + content long enough
425
+ elif has_keywords and len(content) > 100 and top_score >= 0.3:
426
+ should_bypass = True
427
+
428
+ if should_bypass:
429
+                 # Direct template answer for queries about violation ratios + emulation-rank demotion
430
+ if any(kw in query_lower for kw in ["12%", "tỷ lệ", "phần trăm", "hạ bậc", "thi đua"]):
431
+                     # Query about violation percentage and emulation-rank demotion
432
+ section_code = top_data.get("section_code", "")
433
+ section_title = top_data.get("section_title", "")
434
+ doc_title = top_data.get("document_title", "văn bản pháp luật")
435
+
436
+                     # Extract the relevant passage from the content
437
+ content_preview = content[:600] + "..." if len(content) > 600 else content
438
+
439
+ answer = (
440
+ f"Theo {doc_title} ({doc_code}):\n\n"
441
+ f"{section_code}: {section_title}\n\n"
442
+ f"{content_preview}\n\n"
443
+ f"Nguồn: {section_code}, {doc_title} ({doc_code})"
444
+ )
445
+ else:
446
+                     # Generic template for legal queries
447
+ section_code = top_data.get("section_code", "Điều liên quan")
448
+ section_title = top_data.get("section_title", "")
449
+ doc_title = top_data.get("document_title", "văn bản pháp luật")
450
+ content_preview = content[:500] + "..." if len(content) > 500 else content
451
+
452
+ answer = (
453
+ f"Kết quả chính xác nhất:\n\n"
454
+ f"- Văn bản: {doc_title} ({doc_code})\n"
455
+ f"- Điều khoản: {section_code}" + (f" – {section_title}" if section_title else "") + "\n\n"
456
+ f"{content_preview}\n\n"
457
+ f"Nguồn: {section_code}, {doc_title} ({doc_code})"
458
+ )
459
+
460
+ logger.info(
461
+ "[BYPASS_LLM] Using raw template for legal query (score=%.3f, doc=%s, query='%s')",
462
+ top_score,
463
+ doc_code,
464
+ query[:50]
465
+ )
466
+
467
+ return {
468
+ "message": answer,
469
+ "intent": intent,
470
+ "confidence": min(0.99, top_score + 0.05),
471
+ "results": search_result["results"][:3],
472
+ "count": min(3, search_result["count"]),
473
+ "_source": "raw_template",
474
+ "routing": "raw_template"
475
+ }
476
+
477
+ # Get conversation context if available
478
+ context = None
479
+ context_summary = ""
480
+ if session_id:
481
+ try:
482
+ recent_messages = ConversationContext.get_recent_messages(session_id, limit=5)
483
+ context = [
484
+ {
485
+ "role": msg.role,
486
+ "content": msg.content,
487
+ "intent": msg.intent
488
+ }
489
+ for msg in recent_messages
490
+ ]
491
+                 # Build a context summary to feed into the prompt when conversation history exists
492
+ if len(context) > 1:
493
+ context_parts = []
494
+                     for msg in reversed(context[-3:]):  # Only use the 3 most recent messages
495
+ if msg["role"] == "user":
496
+ context_parts.append(f"Người dùng: {msg['content'][:100]}")
497
+ elif msg["role"] == "bot":
498
+ context_parts.append(f"Bot: {msg['content'][:100]}")
499
+ if context_parts:
500
+ context_summary = "\n\nNgữ cảnh cuộc trò chuyện trước đó:\n" + "\n".join(context_parts)
501
+ except Exception as exc:
502
+ logger.warning("[CONTEXT] Failed to load conversation context: %s", exc)
503
+
504
+ # Enhance query with context if available
505
+ enhanced_query = query
506
+ if context_summary:
507
+ enhanced_query = query + context_summary
508
+
509
+ # Generate response message using LLM if available and we have documents
510
+ message = None
511
+ if self.llm_generator and search_result["count"] > 0:
512
+ # For legal queries, use structured output (top-4 for good context and speed)
513
+ if intent == "search_legal" and search_result["results"]:
514
+ legal_docs = [r["data"] for r in search_result["results"] if r.get("type") == "legal"][:4] # Top-4 for balance
515
+ if legal_docs:
516
+ structured_answer = self.llm_generator.generate_structured_legal_answer(
517
+                         enhanced_query,  # Use enhanced_query, which carries the conversation context
518
+ legal_docs,
519
+ prefill_summary=None
520
+ )
521
+ if structured_answer:
522
+ message = format_structured_legal_answer(structured_answer)
523
+
524
+ # For other intents or if structured failed, use regular LLM generation
525
+ if not message:
526
+ documents = [r["data"] for r in search_result["results"][:4]] # Top-4 for balance
527
+ message = self.llm_generator.generate_answer(
528
+                 enhanced_query,  # Use enhanced_query, which carries the conversation context
529
+ context=context,
530
+ documents=documents
531
+ )
532
+
533
+ # Fallback to template if LLM not available or failed
534
+ if not message:
535
+ if search_result["count"] > 0:
536
+             # Special-case legal queries: format the top hit nicely instead of using the generic template
537
+ if intent == "search_legal" and search_result["results"]:
538
+ top_result = search_result["results"][0]
539
+ top_data = top_result.get("data", {})
540
+ doc_code = top_data.get("document_code", "")
541
+ doc_title = top_data.get("document_title", "văn bản pháp luật")
542
+ section_code = top_data.get("section_code", "")
543
+ section_title = top_data.get("section_title", "")
544
+ content = top_data.get("content", "") or top_data.get("excerpt", "")
545
+
546
+ if content and len(content) > 50:
547
+ content_preview = content[:400] + "..." if len(content) > 400 else content
548
+ message = (
549
+ f"Tôi tìm thấy {search_result['count']} điều khoản liên quan đến '{query}':\n\n"
550
+ f"**{section_code}**: {section_title or 'Nội dung liên quan'}\n\n"
551
+ f"{content_preview}\n\n"
552
+ f"Nguồn: {doc_title}" + (f" ({doc_code})" if doc_code else "")
553
+ )
554
+ else:
555
+ template = RESPONSE_TEMPLATES.get(intent, RESPONSE_TEMPLATES["general_query"])
556
+ message = template.format(
557
+ count=search_result["count"],
558
+ query=query
559
+ )
560
+ else:
561
+ template = RESPONSE_TEMPLATES.get(intent, RESPONSE_TEMPLATES["general_query"])
562
+ message = template.format(
563
+ count=search_result["count"],
564
+ query=query
565
+ )
566
+ else:
567
+ message = RESPONSE_TEMPLATES["no_results"].format(query=query)
568
+
569
+ # Limit results to top 5 for response
570
+ results = search_result["results"][:5]
571
+
572
+ response = {
573
+ "message": message,
574
+ "intent": intent,
575
+ "confidence": 0.95, # High confidence for Slow Path (thorough search)
576
+ "results": results,
577
+ "count": len(results),
578
+ "_source": "slow_path"
579
+ }
580
+
581
+ return response
582
+
583
+ def _maybe_request_clarification(
584
+ self,
585
+ query: str,
586
+ search_result: Dict[str, Any],
587
+ selected_document_code: Optional[str] = None,
588
+ ) -> Optional[Dict[str, Any]]:
589
+ """
590
+         Decide whether to ask the user to pick a document (wizard step: choose_document).
591
+
592
+         Option-first principle:
593
+         - If the user has NOT yet selected a document in this session
594
+         - And the query does NOT spell out a document code
595
+         - And the search returned results
596
+         => Prefer returning a list of documents for the user to choose from, rather than answering directly.
597
+ """
598
+ if selected_document_code:
599
+ return None
600
+ if not search_result or search_result.get("count", 0) == 0:
601
+ return None
602
+
603
+         # If the user already spelled out a document code in the query (e.g. 264/QĐ-TW),
604
+         # there is no need to ask again - prefer using that exact code.
605
+ if self._has_explicit_document_code_in_query(query):
606
+ return None
607
+
608
+         # Prefer the canonical list of documents if they exist in the DB.
609
+         # However, to keep the option-first wizard working in all cases,
610
+         # fall back to building a static list when the DB lacks the data.
611
+ fallback_candidates: List[Dict[str, Any]] = []
612
+ try:
613
+ fallback_docs = list(
614
+ LegalDocument.objects.filter(
615
+ code__in=["264-QD-TW", "QD-69-TW", "TT-02-CAND"]
616
+ )
617
+ )
618
+ for doc in fallback_docs:
619
+ summary = getattr(doc, "summary", "") or ""
620
+ metadata = getattr(doc, "metadata", {}) or {}
621
+ if not summary and isinstance(metadata, dict):
622
+ summary = metadata.get("summary", "")
623
+ fallback_candidates.append(
624
+ {
625
+ "code": doc.code,
626
+ "title": getattr(doc, "title", "") or doc.code,
627
+ "summary": summary,
628
+ "doc_type": getattr(doc, "doc_type", "") or "",
629
+ "section_title": "",
630
+ }
631
+ )
632
+ except Exception as exc:
633
+ logger.warning(
634
+ "[CLARIFICATION] Fallback documents lookup failed, using static list: %s",
635
+ exc,
636
+ )
637
+
638
+         # If the DB does not have enough information, always provide a minimal static list
639
+         # so the option-first wizard keeps working.
640
+ if not fallback_candidates:
641
+ fallback_candidates = [
642
+ {
643
+ "code": "264-QD-TW",
644
+ "title": "Quyết định 264-QĐ/TW về kỷ luật đảng viên",
645
+ "summary": "",
646
+ "doc_type": "",
647
+ "section_title": "",
648
+ },
649
+ {
650
+ "code": "QD-69-TW",
651
+ "title": "Quy định 69-QĐ/TW về kỷ luật tổ chức đảng, đảng viên",
652
+ "summary": "",
653
+ "doc_type": "",
654
+ "section_title": "",
655
+ },
656
+ {
657
+ "code": "TT-02-CAND",
658
+ "title": "Thông tư 02/2021/TT-BCA về điều lệnh CAND",
659
+ "summary": "",
660
+ "doc_type": "",
661
+ "section_title": "",
662
+ },
663
+ ]
664
+
665
+ payload = self._build_clarification_payload(query, fallback_candidates)
666
+ if payload:
667
+ logger.info(
668
+ "[CLARIFICATION] Requesting user choice among canonical documents: %s",
669
+ [c["code"] for c in fallback_candidates],
670
+ )
671
+ return payload
672
+
673
+ def _has_explicit_document_code_in_query(self, query: str) -> bool:
674
+ """
675
+ Check if the raw query string explicitly contains a known document code
676
+ pattern (e.g. '264/QĐ-TW', 'QD-69-TW', 'TT-02-CAND').
677
+
678
+         Unlike _detect_document_code (which scans the whole LegalDocument table by token),
679
+         this helper relies only on fixed regexes, to avoid over-detecting codes in generic
680
+         questions such as 'xử lí kỷ luật đảng viên thế nào'.
681
+ """
682
+ normalized = self._remove_accents(query).upper()
683
+ if not normalized:
684
+ return False
685
+ for pattern in DOCUMENT_CODE_PATTERNS:
686
+ try:
687
+ if re.search(pattern, normalized):
688
+ return True
689
+ except re.error:
690
+                 # Skip invalid patterns instead of blocking the flow
691
+ continue
692
+ return False
693
+
694
+ def _collect_document_candidates(
695
+ self,
696
+ legal_results: List[Dict[str, Any]],
697
+ limit: int = 4,
698
+ ) -> List[Dict[str, Any]]:
699
+ """Collect unique document candidates from legal results."""
700
+ ordered_codes: List[str] = []
701
+ seen: set[str] = set()
702
+ for result in legal_results:
703
+ data = result.get("data", {})
704
+ code = (data.get("document_code") or "").strip()
705
+ if not code:
706
+ continue
707
+ upper = code.upper()
708
+ if upper in seen:
709
+ continue
710
+ ordered_codes.append(code)
711
+ seen.add(upper)
712
+ if len(ordered_codes) >= limit:
713
+ break
714
+ if len(ordered_codes) < 2:
715
+ return []
716
+ try:
717
+ documents = {
718
+ doc.code.upper(): doc
719
+ for doc in LegalDocument.objects.filter(code__in=ordered_codes)
720
+ }
721
+ except Exception as exc:
722
+ logger.warning("[CLARIFICATION] Unable to load documents for candidates: %s", exc)
723
+ documents = {}
724
+ candidates: List[Dict[str, Any]] = []
725
+ for code in ordered_codes:
726
+ upper = code.upper()
727
+ doc_obj = documents.get(upper)
728
+ section = next(
729
+ (
730
+ res
731
+ for res in legal_results
732
+ if (res.get("data", {}).get("document_code") or "").strip().upper() == upper
733
+ ),
734
+ None,
735
+ )
736
+ data = section.get("data", {}) if section else {}
737
+ summary = ""
738
+ if doc_obj:
739
+ summary = doc_obj.summary or ""
740
+ if not summary and isinstance(doc_obj.metadata, dict):
741
+ summary = doc_obj.metadata.get("summary", "")
742
+ if not summary:
743
+ summary = data.get("excerpt") or data.get("content", "")[:200]
744
+ candidates.append(
745
+ {
746
+ "code": code,
747
+ "title": data.get("document_title") or (doc_obj.title if doc_obj else code),
748
+ "summary": summary,
749
+ "doc_type": doc_obj.doc_type if doc_obj else "",
750
+ "section_title": data.get("section_title") or "",
751
+ }
752
+ )
753
+ return candidates
754
+
755
+ def _build_clarification_payload(
756
+ self,
757
+ query: str,
758
+ candidates: List[Dict[str, Any]],
759
+ ) -> Optional[Dict[str, Any]]:
760
+ if not candidates:
761
+ return None
762
+ default_message = (
763
+ "Tôi tìm thấy một số văn bản có thể phù hợp. "
764
+ "Bạn vui lòng chọn văn bản muốn tra cứu để tôi trả lời chính xác hơn."
765
+ )
766
+ llm_payload = self._call_clarification_llm(query, candidates)
767
+ message = default_message
768
+ options: List[Dict[str, Any]] = []
769
+
770
+         # Prefer the LLM suggestions, but always make sure fallback options exist
771
+ if llm_payload:
772
+ message = llm_payload.get("message") or default_message
773
+ raw_options = llm_payload.get("options")
774
+ if isinstance(raw_options, list):
775
+ options = [
776
+ {
777
+ "code": (opt.get("code") or candidate.get("code", "")).upper(),
778
+ "title": opt.get("title") or opt.get("document_title") or candidate.get("title", ""),
779
+ "reason": opt.get("reason")
780
+ or opt.get("summary")
781
+ or candidate.get("summary")
782
+ or candidate.get("section_title")
783
+ or "",
784
+ }
785
+ for opt, candidate in zip(
786
+ raw_options,
787
+ candidates[: len(raw_options)],
788
+ )
789
+ if (opt.get("code") or candidate.get("code"))
790
+ and (opt.get("title") or opt.get("document_title") or candidate.get("title"))
791
+ ]
792
+
793
+         # If the LLM did not return valid options, fall back to building them from the candidates
794
+ if not options:
795
+ options = [
796
+ {
797
+ "code": candidate["code"].upper(),
798
+ "title": candidate["title"],
799
+ "reason": candidate.get("summary") or candidate.get("section_title") or "",
800
+ }
801
+ for candidate in candidates[:3]
802
+ ]
803
+ if not any(opt.get("code") == "__other__" for opt in options):
804
+ options.append(
805
+ {
806
+ "code": "__other__",
807
+ "title": "Khác",
808
+ "reason": "Tôi muốn hỏi văn bản hoặc chủ đề khác",
809
+ }
810
+ )
811
+ return {
812
+             # Wizard-style payload: prefer the options shape for the UI
813
+ "type": "options",
814
+ "wizard_stage": "choose_document",
815
+ "message": message,
816
+ "options": options,
817
+ "clarification": {
818
+ "message": message,
819
+ "options": options,
820
+ },
821
+ "results": [],
822
+ "count": 0,
823
+ }
824
+
825
+ def _call_clarification_llm(
826
+ self,
827
+ query: str,
828
+ candidates: List[Dict[str, Any]],
829
+ ) -> Optional[Dict[str, Any]]:
830
+ if not self.llm_generator:
831
+ return None
832
+ try:
833
+ return self.llm_generator.suggest_clarification_topics(
834
+ query,
835
+ candidates,
836
+ max_options=3,
837
+ )
838
+ except Exception as exc:
839
+ logger.warning("[CLARIFICATION] LLM suggestion failed: %s", exc)
840
+ return None
841
+
842
+ def _parallel_search_prepare(
843
+ self,
844
+ document_code: str,
845
+ keywords: List[str],
846
+ session_id: Optional[str] = None,
847
+ ) -> None:
848
+ """
849
+ Trigger parallel search in background when user selects a document option.
850
+ Stores results in cache for Stage 2 (choose topic).
851
+
852
+ Args:
853
+ document_code: Selected document code
854
+ keywords: Keywords extracted from query/options
855
+ session_id: Session ID for caching results
856
+ """
857
+ if not session_id:
858
+ return
859
+
860
+ def _search_task():
861
+ try:
862
+ logger.info(
863
+ "[PARALLEL_SEARCH] Starting background search for doc=%s, keywords=%s",
864
+ document_code,
865
+ keywords[:5],
866
+ )
867
+
868
+ # Check Redis cache first
869
+ cache_key = f"prefetch:{document_code.upper()}:{hashlib.sha256(' '.join(keywords).encode()).hexdigest()[:16]}"
870
+ cached_result = None
871
+ if self.redis_cache and self.redis_cache.is_available():
872
+ cached_result = self.redis_cache.get(cache_key)
873
+ if cached_result:
874
+ logger.info(
875
+ "[PARALLEL_SEARCH] ✅ Cache hit for doc=%s",
876
+ document_code
877
+ )
878
+ # Store in in-memory cache too
879
+ with self._cache_lock:
880
+ if session_id not in self._prefetched_cache:
881
+ self._prefetched_cache[session_id] = {}
882
+ self._prefetched_cache[session_id]["document_results"] = cached_result
883
+ return
884
+
885
+ # Search in the selected document
886
+ query_text = " ".join(keywords) if keywords else ""
887
+ search_result = self._search_by_intent(
888
+ intent="search_legal",
889
+ query=query_text,
890
+ limit=20, # Get more results for topic options
891
+ preferred_document_code=document_code.upper(),
892
+ )
893
+
894
+ # Prepare cache data
895
+ cache_data = {
896
+ "document_code": document_code,
897
+ "results": search_result.get("results", []),
898
+ "count": search_result.get("count", 0),
899
+ "timestamp": time.time(),
900
+ }
901
+
902
+ # Store in Redis cache
903
+ if self.redis_cache and self.redis_cache.is_available():
904
+ self.redis_cache.set(cache_key, cache_data, ttl_seconds=self.prefetch_cache_ttl)
905
+ logger.debug(
906
+ "[PARALLEL_SEARCH] Cached prefetch results (TTL: %ds)",
907
+ self.prefetch_cache_ttl
908
+ )
909
+
910
+ # Store in in-memory cache (fallback)
911
+ with self._cache_lock:
912
+ if session_id not in self._prefetched_cache:
913
+ self._prefetched_cache[session_id] = {}
914
+ self._prefetched_cache[session_id]["document_results"] = cache_data
915
+
916
+ logger.info(
917
+ "[PARALLEL_SEARCH] Completed background search for doc=%s, found %d results",
918
+ document_code,
919
+ search_result.get("count", 0),
920
+ )
921
+ except Exception as exc:
922
+ logger.warning("[PARALLEL_SEARCH] Background search failed: %s", exc)
923
+
924
+ # Submit to thread pool
925
+ self._executor.submit(_search_task)
926
+
927
+ def _parallel_search_topic(
928
+ self,
929
+ document_code: str,
930
+ topic_keywords: List[str],
931
+ session_id: Optional[str] = None,
932
+ ) -> None:
933
+ """
934
+ Trigger parallel search when user selects a topic option.
935
+ Stores results for final answer generation.
936
+
937
+ Args:
938
+ document_code: Selected document code
939
+ topic_keywords: Keywords from selected topic
940
+ session_id: Session ID for caching results
941
+ """
942
+ if not session_id:
943
+ return
944
+
945
+ def _search_task():
946
+ try:
947
+ logger.info(
948
+ "[PARALLEL_SEARCH] Starting topic search for doc=%s, keywords=%s",
949
+ document_code,
950
+ topic_keywords[:5],
951
+ )
952
+
953
+ # Search with topic keywords
954
+ query_text = " ".join(topic_keywords) if topic_keywords else ""
955
+ search_result = self._search_by_intent(
956
+ intent="search_legal",
957
+ query=query_text,
958
+ limit=10,
959
+ preferred_document_code=document_code.upper(),
960
+ )
961
+
962
+ # Store in cache
963
+ with self._cache_lock:
964
+ if session_id not in self._prefetched_cache:
965
+ self._prefetched_cache[session_id] = {}
966
+ self._prefetched_cache[session_id]["topic_results"] = {
967
+ "document_code": document_code,
968
+ "keywords": topic_keywords,
969
+ "results": search_result.get("results", []),
970
+ "count": search_result.get("count", 0),
971
+ "timestamp": time.time(),
972
+ }
973
+
974
+ logger.info(
975
+ "[PARALLEL_SEARCH] Completed topic search, found %d results",
976
+ search_result.get("count", 0),
977
+ )
978
+ except Exception as exc:
979
+ logger.warning("[PARALLEL_SEARCH] Topic search failed: %s", exc)
980
+
981
+ # Submit to thread pool
982
+ self._executor.submit(_search_task)
983
+
984
+ def _get_prefetched_results(
985
+ self,
986
+ session_id: Optional[str],
987
+ result_type: str = "document_results",
988
+ ) -> Optional[Dict[str, Any]]:
989
+ """
990
+ Get prefetched search results from cache.
991
+
992
+ Args:
993
+ session_id: Session ID
994
+ result_type: "document_results" or "topic_results"
995
+
996
+ Returns:
997
+ Cached results dict or None
998
+ """
999
+ if not session_id:
1000
+ return None
1001
+
1002
+ with self._cache_lock:
1003
+ cache_entry = self._prefetched_cache.get(session_id)
1004
+ if not cache_entry:
1005
+ return None
1006
+
1007
+ results = cache_entry.get(result_type)
1008
+ if not results:
1009
+ return None
1010
+
1011
+ # Check if results are still fresh (within 5 minutes)
1012
+ timestamp = results.get("timestamp", 0)
1013
+ if time.time() - timestamp > 300: # 5 minutes
1014
+ logger.debug("[PARALLEL_SEARCH] Prefetched results expired for session=%s", session_id)
1015
+ return None
1016
+
1017
+ return results
1018
+
1019
+ def _clear_prefetched_cache(self, session_id: Optional[str]) -> None:
1020
+ """Clear prefetched cache for a session."""
1021
+ if not session_id:
1022
+ return
1023
+
1024
+ with self._cache_lock:
1025
+ if session_id in self._prefetched_cache:
1026
+ del self._prefetched_cache[session_id]
1027
+ logger.debug("[PARALLEL_SEARCH] Cleared cache for session=%s", session_id)
1028
+
1029
+ def _search_by_intent(
1030
+ self,
1031
+ intent: str,
1032
+ query: str,
1033
+ limit: int = 5,
1034
+ preferred_document_code: Optional[str] = None,
1035
+ ) -> Dict[str, Any]:
1036
+         """Search based on classified intent. The default limit is kept small (5) for faster inference on the free tier; callers pass a larger limit (e.g. 15) when reranking."""
1037
+ # Use original query for better matching
1038
+ keywords = query.strip()
1039
+ extracted = " ".join(self.chatbot.extract_keywords(query))
1040
+ if extracted and len(extracted) > 2:
1041
+ keywords = f"{keywords} {extracted}"
1042
+
1043
+ results = []
1044
+
1045
+ if intent == "search_fine":
1046
+ qs = Fine.objects.all()
1047
+ text_fields = ["name", "code", "article", "decree", "remedial"]
1048
+ search_results = search_with_ml(qs, keywords, text_fields, top_k=limit, min_score=0.1)
1049
+ results = [{"type": "fine", "data": {
1050
+ "id": f.id,
1051
+ "name": f.name,
1052
+ "code": f.code,
1053
+ "min_fine": float(f.min_fine) if f.min_fine else None,
1054
+ "max_fine": float(f.max_fine) if f.max_fine else None,
1055
+ "article": f.article,
1056
+ "decree": f.decree,
1057
+ }} for f in search_results]
1058
+
1059
+ elif intent == "search_procedure":
1060
+ qs = Procedure.objects.all()
1061
+ text_fields = ["title", "domain", "conditions", "dossier"]
1062
+ search_results = search_with_ml(qs, keywords, text_fields, top_k=limit, min_score=0.1)
1063
+ results = [{"type": "procedure", "data": {
1064
+ "id": p.id,
1065
+ "title": p.title,
1066
+ "domain": p.domain,
1067
+ "level": p.level,
1068
+ }} for p in search_results]
1069
+
1070
+ elif intent == "search_office":
1071
+ qs = Office.objects.all()
1072
+ text_fields = ["unit_name", "address", "district", "service_scope"]
1073
+ search_results = search_with_ml(qs, keywords, text_fields, top_k=limit, min_score=0.1)
1074
+ results = [{"type": "office", "data": {
1075
+ "id": o.id,
1076
+ "unit_name": o.unit_name,
1077
+ "address": o.address,
1078
+ "district": o.district,
1079
+ "phone": o.phone,
1080
+ "working_hours": o.working_hours,
1081
+ }} for o in search_results]
1082
+
1083
+ elif intent == "search_advisory":
1084
+ qs = Advisory.objects.all()
1085
+ text_fields = ["title", "summary"]
1086
+ search_results = search_with_ml(qs, keywords, text_fields, top_k=limit, min_score=0.1)
1087
+ results = [{"type": "advisory", "data": {
1088
+ "id": a.id,
1089
+ "title": a.title,
1090
+ "summary": a.summary,
1091
+ }} for a in search_results]
1092
+
1093
+ elif intent == "search_legal":
1094
+ qs = LegalSection.objects.all()
1095
+ text_fields = ["section_title", "section_code", "content"]
1096
+ detected_code = self._detect_document_code(query)
1097
+ effective_code = preferred_document_code or detected_code
1098
+ filtered = False
1099
+ if effective_code:
1100
+ filtered_qs = qs.filter(document__code__iexact=effective_code)
1101
+ if filtered_qs.exists():
1102
+ qs = filtered_qs
1103
+ filtered = True
1104
+ logger.info(
1105
+ "[SEARCH] Prefiltering legal sections for document code %s (query='%s')",
1106
+ effective_code,
1107
+ query,
1108
+ )
1109
+ else:
1110
+ logger.info(
1111
+ "[SEARCH] Document code %s detected but no sections found locally, falling back to full corpus",
1112
+ effective_code,
1113
+ )
1114
+ else:
1115
+ logger.debug("[SEARCH] No document code detected for query: %s", query)
1116
+ # Use pure semantic search (100% vector, no BM25)
1117
+ search_results = pure_semantic_search(
1118
+ [keywords],
1119
+ qs,
1120
+ top_k=limit, # limit=15 for reranking, will be reduced to 4
1121
+ text_fields=text_fields
1122
+ )
1123
+ results = self._format_legal_results(search_results, detected_code, query=query)
1124
+ logger.info(
1125
+ "[SEARCH] Legal intent processed (query='%s', code=%s, filtered=%s, results=%d)",
1126
+ query,
1127
+ detected_code or "None",
1128
+ filtered,
1129
+ len(results),
1130
+ )
1131
+
1132
+ return {
1133
+ "intent": intent,
1134
+ "query": query,
1135
+ "keywords": keywords,
1136
+ "results": results,
1137
+ "count": len(results),
1138
+ "detected_code": detected_code,
1139
+ }
1140
+
1141
+ def _should_save_to_golden(self, query: str, response: Dict) -> bool:
1142
+ """
1143
+ Decide if response should be saved to golden dataset.
1144
+
1145
+ Criteria:
1146
+ - High confidence (>0.95)
1147
+ - Has results
1148
+ - Response is complete and well-formed
1149
+ - Not already in golden dataset
1150
+ """
1151
+ try:
1152
+ from hue_portal.core.models import GoldenQuery
1153
+
1154
+ # Check if already exists
1155
+ query_normalized = self._normalize_query(query)
1156
+ if GoldenQuery.objects.filter(query_normalized=query_normalized, is_active=True).exists():
1157
+ return False
1158
+
1159
+ # Check criteria
1160
+ has_results = response.get("count", 0) > 0
1161
+ has_message = bool(response.get("message", "").strip())
1162
+ confidence = response.get("confidence", 0.0)
1163
+
1164
+ # Only save if high quality
1165
+ if has_results and has_message and confidence >= 0.95:
1166
+ # Additional check: message should be substantial (not just template)
1167
+ message = response.get("message", "")
1168
+ if len(message) > 50: # Substantial response
1169
+ return True
1170
+
1171
+ return False
1172
+ except Exception as e:
1173
+ logger.warning(f"Error checking if should save to golden: {e}")
1174
+ return False
1175
+
1176
+ def _normalize_query(self, query: str) -> str:
1177
+ """Normalize query for matching."""
1178
+ normalized = query.lower().strip()
1179
+ # Remove accents
1180
+ normalized = unicodedata.normalize("NFD", normalized)
1181
+ normalized = "".join(ch for ch in normalized if unicodedata.category(ch) != "Mn")
1182
+ # Remove extra spaces
1183
+ normalized = re.sub(r'\s+', ' ', normalized).strip()
1184
+ return normalized
1185
+
1186
+ def _detect_document_code(self, query: str) -> Optional[str]:
1187
+ """Detect known document code mentioned in the query."""
1188
+ normalized_query = self._remove_accents(query).upper()
1189
+ if not normalized_query:
1190
+ return None
1191
+ try:
1192
+ codes = LegalDocument.objects.values_list("code", flat=True)
1193
+ except Exception as exc:
1194
+ logger.debug("Unable to fetch document codes: %s", exc)
1195
+ return None
1196
+
1197
+ for code in codes:
1198
+ if not code:
1199
+ continue
1200
+ tokens = self._split_code_tokens(code)
1201
+ if tokens and all(token in normalized_query for token in tokens):
1202
+ logger.info("[SEARCH] Detected document code %s in query", code)
1203
+ return code
1204
+ return None
1205
+
1206
+ def _split_code_tokens(self, code: str) -> List[str]:
1207
+ """Split a document code into uppercase accentless tokens."""
1208
+ normalized = self._remove_accents(code).upper()
1209
+ return [tok for tok in re.split(r"[-/\s]+", normalized) if tok]
1210
+
1211
+ def _remove_accents(self, text: str) -> str:
1212
+ if not text:
1213
+ return ""
1214
+ normalized = unicodedata.normalize("NFD", text)
1215
+ return "".join(ch for ch in normalized if unicodedata.category(ch) != "Mn")
1216
+
1217
+ def _format_legal_results(
1218
+ self,
1219
+ search_results: List[Any],
1220
+ detected_code: Optional[str],
1221
+ query: Optional[str] = None,
1222
+ ) -> List[Dict[str, Any]]:
1223
+ """Build legal result payload and apply ordering/boosting based on doc code and keywords."""
1224
+ entries: List[Dict[str, Any]] = []
1225
+ upper_detected = detected_code.upper() if detected_code else None
1226
+
1227
+ # Keywords that indicate important legal concepts (boost score if found)
1228
+ important_keywords = []
1229
+ if query:
1230
+ query_lower = query.lower()
1231
+ # Keywords for percentage/threshold queries
1232
+ if any(kw in query_lower for kw in ["%", "phần trăm", "tỷ lệ", "12%", "20%", "10%"]):
1233
+ important_keywords.extend(["%", "phần trăm", "tỷ lệ", "12", "20", "10"])
1234
+ # Keywords for ranking/demotion queries
1235
+ if any(kw in query_lower for kw in ["hạ bậc", "thi đua", "xếp loại", "đánh giá"]):
1236
+ important_keywords.extend(["hạ bậc", "thi đua", "xếp loại", "đánh giá"])
1237
+
1238
+ for ls in search_results:
1239
+ doc = ls.document
1240
+ doc_code = doc.code if doc else None
1241
+ score = getattr(ls, "_ml_score", getattr(ls, "rank", 0.0)) or 0.0
1242
+
1243
+ # Boost score if content contains important keywords
1244
+ content_text = (ls.content or ls.section_title or "").lower()
1245
+ keyword_boost = 0.0
1246
+ if important_keywords and content_text:
1247
+ for kw in important_keywords:
1248
+ if kw.lower() in content_text:
1249
+ keyword_boost += 0.15 # Boost 0.15 per keyword match
1250
+ logger.debug(
1251
+ "[BOOST] Keyword '%s' found in section %s, boosting score",
1252
+ kw,
1253
+ ls.section_code,
1254
+ )
1255
+
1256
+ entries.append(
1257
+ {
1258
+ "type": "legal",
1259
+ "score": float(score) + keyword_boost,
1260
+ "data": {
1261
+ "id": ls.id,
1262
+ "section_code": ls.section_code,
1263
+ "section_title": ls.section_title,
1264
+ "content": ls.content[:500] if ls.content else "",
1265
+ "excerpt": ls.excerpt,
1266
+ "document_code": doc_code,
1267
+ "document_title": doc.title if doc else None,
1268
+ "page_start": ls.page_start,
1269
+ "page_end": ls.page_end,
1270
+ },
1271
+ }
1272
+ )
1273
+
1274
+ if upper_detected:
1275
+ exact_matches = [
1276
+ r for r in entries if (r["data"].get("document_code") or "").upper() == upper_detected
1277
+ ]
1278
+ if exact_matches:
1279
+ others = [r for r in entries if r not in exact_matches]
1280
+ entries = exact_matches + others
1281
+ else:
1282
+ for entry in entries:
1283
+ doc_code = (entry["data"].get("document_code") or "").upper()
1284
+ if doc_code == upper_detected:
1285
+ entry["score"] = (entry.get("score") or 0.1) * 10
1286
+ entries.sort(key=lambda r: r.get("score") or 0, reverse=True)
1287
+ else:
1288
+ # Sort by boosted score
1289
+ entries.sort(key=lambda r: r.get("score") or 0, reverse=True)
1290
+ return entries
1291
+
1292
+ def _is_complex_query(self, query: str) -> bool:
1293
+ """
1294
+ Detect if query is complex and requires LLM reasoning (not suitable for Fast Path).
1295
+
1296
+ Complex queries contain keywords like: %, bậc, thi đua, tỷ lệ, liên đới, tăng nặng, giảm nhẹ, đơn vị vi phạm
1297
+ """
1298
+ if not query:
1299
+ return False
1300
+ query_lower = query.lower()
1301
+ complex_keywords = [
1302
+ "%", "phần trăm",
1303
+ "bậc", "hạ bậc", "nâng bậc",
1304
+ "thi đua", "xếp loại", "đánh giá",
1305
+ "tỷ lệ", "tỉ lệ",
1306
+ "liên đới", "liên quan",
1307
+ "tăng nặng", "tăng nặng hình phạt",
1308
+ "giảm nhẹ", "giảm nhẹ hình phạt",
1309
+ "đơn vị vi phạm", "đơn vị có",
1310
+ ]
1311
+ for keyword in complex_keywords:
1312
+ if keyword in query_lower:
1313
+ logger.info(
1314
+ "[FAST_PATH] Complex query detected (keyword: '%s'), forcing Slow Path",
1315
+ keyword,
1316
+ )
1317
+ return True
1318
+ return False
1319
+
1320
+ def _maybe_fast_path_response(
1321
+ self, results: List[Dict[str, Any]], query: Optional[str] = None
1322
+ ) -> Optional[Dict[str, Any]]:
1323
+ """Return fast-path response if results are confident enough."""
1324
+ if not results:
1325
+ return None
1326
+
1327
+ # Double-check: if query is complex, never use Fast Path
1328
+ if query and self._is_complex_query(query):
1329
+ return None
1330
+ top_result = results[0]
1331
+ top_score = top_result.get("score", 0.0) or 0.0
1332
+ doc_code = (top_result.get("data", {}).get("document_code") or "").upper()
1333
+
1334
+ if top_score >= 0.88 and doc_code:
1335
+ logger.info(
1336
+ "[FAST_PATH] Top score hit (%.3f) for document %s", top_score, doc_code
1337
+ )
1338
+ message = self._format_fast_legal_message(top_result)
1339
+ return {
1340
+ "message": message,
1341
+ "results": results[:3],
1342
+ "count": min(3, len(results)),
1343
+ "confidence": min(0.99, top_score + 0.05),
1344
+ }
1345
+
1346
+ top_three = results[:3]
1347
+ if len(top_three) >= 2:
1348
+ doc_codes = [
1349
+ (res.get("data", {}).get("document_code") or "").upper()
1350
+ for res in top_three
1351
+ if res.get("data", {}).get("document_code")
1352
+ ]
1353
+ if doc_codes and len(set(doc_codes)) == 1:
1354
+ logger.info(
1355
+ "[FAST_PATH] Top-%d results share same document %s",
1356
+ len(top_three),
1357
+ doc_codes[0],
1358
+ )
1359
+ message = self._format_fast_legal_message(top_three[0])
1360
+ return {
1361
+ "message": message,
1362
+ "results": top_three,
1363
+ "count": len(top_three),
1364
+ "confidence": min(0.97, (top_three[0].get("score") or 0.9) + 0.04),
1365
+ }
1366
+ return None
1367
+
1368
+ def _format_fast_legal_message(self, result: Dict[str, Any]) -> str:
1369
+ """Format a concise legal answer without LLM."""
1370
+ data = result.get("data", {})
1371
+ doc_title = data.get("document_title") or "văn bản pháp luật"
1372
+ doc_code = data.get("document_code") or ""
1373
+ section_code = data.get("section_code") or "Điều liên quan"
1374
+ section_title = data.get("section_title") or ""
1375
+ content = (data.get("content") or data.get("excerpt") or "").strip()
1376
+ if len(content) > 400:
1377
+ trimmed = content[:400].rsplit(" ", 1)[0]
1378
+ content = f"{trimmed}..."
1379
+ intro = "Kết quả chính xác nhất:"
1380
+ lines = [intro]
1381
+ if doc_title or doc_code:
1382
+ lines.append(f"- Văn bản: {doc_title or 'văn bản pháp luật'}" + (f" ({doc_code})" if doc_code else ""))
1383
+ section_label = section_code
1384
+ if section_title:
1385
+ section_label = f"{section_code} – {section_title}"
1386
+ lines.append(f"- Điều khoản: {section_label}")
1387
+ lines.append("")
1388
+ lines.append(content)
1389
+ citation_doc = doc_title or doc_code or "nguồn chính thức"
1390
+ lines.append(f"\nNguồn: {section_label}, {citation_doc}.")
1391
+ return "\n".join(lines)
1392
+
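
For orientation, a minimal usage sketch of the handler added above (not part of the commit; the query text and session id are illustrative, and it assumes Django settings and the chatbot models are already initialized):

# Illustrative sketch - not part of this diff.
from hue_portal.chatbot.slow_path_handler import SlowPathHandler

handler = SlowPathHandler()

# A general legal question with no explicit document code and no prior selection:
# per the wizard / option-first logic in handle(), the expected response is an
# options payload (type="options", wizard_stage="choose_document") rather than
# a full answer.
response = handler.handle(
    query="Đảng viên vi phạm kỷ luật thì bị xử lý như thế nào?",  # illustrative query
    intent="search_legal",
    session_id="demo-session",          # hypothetical session id
    selected_document_code=None,
)
print(response.get("type"), response.get("wizard_stage"), response.get("count"))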
backend/hue_portal/chatbot/structured_legal.py ADDED
@@ -0,0 +1,276 @@
1
+ """
2
+ Structured legal answer helpers using LangChain output parsers.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import logging
9
+ import textwrap
10
+ from functools import lru_cache
11
+ from typing import List, Optional, Sequence
12
+
13
+ from langchain.output_parsers import PydanticOutputParser
14
+ from langchain.schema import OutputParserException
15
+ from pydantic import BaseModel, Field
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class LegalCitation(BaseModel):
21
+ """Single citation item pointing back to a legal document."""
22
+
23
+ document_title: str = Field(..., description="Tên văn bản pháp luật.")
24
+ section_code: str = Field(..., description="Mã điều/khoản được trích dẫn.")
25
+ page_range: Optional[str] = Field(
26
+ None, description="Trang hoặc khoảng trang trong tài liệu."
27
+ )
28
+ summary: str = Field(
29
+ ...,
30
+ description="1-2 câu mô tả nội dung chính của trích dẫn, phải liên quan trực tiếp câu hỏi.",
31
+ )
32
+ snippet: str = Field(
33
+ ..., description="Trích đoạn ngắn gọn (≤500 ký tự) lấy từ tài liệu gốc."
34
+ )
35
+
36
+
37
+ class LegalAnswer(BaseModel):
38
+ """Structured answer returned by the LLM."""
39
+
40
+ summary: str = Field(
41
+ ...,
42
+ description="Đoạn mở đầu tóm tắt kết luận chính, phải nhắc văn bản áp dụng (ví dụ Quyết định 69/QĐ-TW).",
43
+ )
44
+ details: List[str] = Field(
45
+ ...,
46
+ description="Tối thiểu 2 gạch đầu dòng mô tả từng hình thức/điều khoản. Mỗi gạch đầu dòng phải nhắc mã điều hoặc tên văn bản.",
47
+ )
48
+ citations: List[LegalCitation] = Field(
49
+ ...,
50
+ description="Danh sách trích dẫn; phải có ít nhất 1 phần tử tương ứng với các tài liệu đã cung cấp.",
51
+ )
52
+
53
+
54
+ @lru_cache(maxsize=1)
55
+ def get_legal_output_parser() -> PydanticOutputParser:
56
+ """Return cached parser to enforce structured output."""
57
+
58
+ return PydanticOutputParser(pydantic_object=LegalAnswer)
59
+
60
+
61
+ def build_structured_legal_prompt(
62
+ query: str,
63
+ documents: Sequence,
64
+ parser: PydanticOutputParser,
65
+ prefill_summary: Optional[str] = None,
66
+ retry_hint: Optional[str] = None,
67
+ ) -> str:
68
+ """Construct prompt instructing the LLM to return structured JSON."""
69
+
70
+ doc_blocks = []
71
+ # 4 chunks for good context and speed balance
72
+ for idx, doc in enumerate(documents[:4], 1):
73
+ document = getattr(doc, "document", None)
74
+ title = getattr(document, "title", "") or "Không rõ tên văn bản"
75
+ code = getattr(document, "code", "") or "N/A"
76
+ section_code = getattr(doc, "section_code", "") or "Không rõ điều"
77
+ section_title = getattr(doc, "section_title", "") or ""
78
+ page_range = _format_page_range(doc)
79
+ content = getattr(doc, "content", "") or ""
80
+ # Increased snippet to 500 chars to use more RAM and provide better context
81
+ snippet = (content[:500] + "...") if len(content) > 500 else content
82
+
83
+ block = textwrap.dedent(
84
+ f"""
85
+ TÀI LIỆU #{idx}
86
+ Văn bản: {title} (Mã: {code})
87
+ Điều/khoản: {section_code} - {section_title}
88
+ Trang: {page_range or 'Không rõ'}
89
+ Trích đoạn:
90
+ {snippet}
91
+ """
92
+ ).strip()
93
+ doc_blocks.append(block)
94
+
95
+ docs_text = "\n\n".join(doc_blocks)
96
+ reference_lines = []
97
+ title_section_pairs = []
98
+ # 4 chunks to match doc_blocks for balance
99
+ for doc in documents[:4]:
100
+ document = getattr(doc, "document", None)
101
+ title = getattr(document, "title", "") or "Không rõ tên văn bản"
102
+ section_code = getattr(doc, "section_code", "") or "Không rõ điều"
103
+ reference_lines.append(f"- {title} | {section_code}")
104
+ title_section_pairs.append((title, section_code))
105
+ reference_text = "\n".join(reference_lines)
106
+ prefill_block = ""
107
+ if prefill_summary:
108
+ prefill_block = textwrap.dedent(
109
+ f"""
110
+ Bản tóm tắt tiếng Việt đã có sẵn (hãy dùng lại, diễn đạt ngắn gọn hơn, KHÔNG thêm thông tin mới):
111
+ {prefill_summary.strip()}
112
+ """
113
+ ).strip()
114
+ format_instructions = parser.get_format_instructions()
115
+ retry_hint_block = ""
116
+ if retry_hint:
117
+ retry_hint_block = textwrap.dedent(
118
+ f"""
119
+ Nhắc lại: {retry_hint.strip()}
120
+ """
121
+ ).strip()
122
+
123
+ prompt = textwrap.dedent(
124
+ f"""
125
+ Bạn là chuyên gia tư vấn về xử lí kỷ luật cán bộ đảng viên của Phòng Thanh Tra - Công An Thành Phố Huế. Chỉ trả lời dựa trên context được cung cấp, không suy diễn hay tạo thông tin mới.
126
+
127
+ Câu hỏi: {query}
128
+
129
+ Context được sắp xếp theo độ liên quan giảm dần (tài liệu #1 là liên quan nhất):
130
+ {docs_text}
131
+
132
+             Bảng tham chiếu (chỉ sử dụng đúng tên/mã dưới đây):
133
+ {reference_text}
134
+
135
+ Quy tắc bắt buộc:
136
+ 1. CHỈ trả lời dựa trên thông tin trong context ở trên, không tự tạo hoặc suy đoán.
137
+ 2. Phải nhắc rõ văn bản (ví dụ: Thông tư 02 về xử lý điều lệnh trong CAND) và mã điều/khoản chính xác (ví dụ: Điều 7, Điều 8).
138
+ 3. Nếu câu hỏi về tỷ lệ phần trăm, hạ bậc thi đua, xếp loại → phải tìm đúng điều khoản quy định về tỷ lệ đó.
139
+ 4. Nếu KHÔNG tìm thấy thông tin về tỷ lệ %, hạ bậc thi đua trong context → trả lời rõ: "Thông tư 02 không quy định xử lý đơn vị theo tỷ lệ phần trăm vi phạm trong năm" (đừng trích bừa điều khoản khác).
140
+ 5. Cấu trúc trả lời:
141
+ - SUMMARY: Tóm tắt ngắn gọn kết luận chính, nhắc văn bản và điều khoản áp dụng
142
+ - DETAILS: Tối thiểu 2 bullet, mỗi bullet phải có mã điều/khoản và nội dung cụ thể
143
+ - CITATIONS: Danh sách trích dẫn với document_title, section_code, snippet ≤500 ký tự
144
+ 6. Tuyệt đối không chép lại schema hay thêm khóa "$defs"; chỉ xuất đối tượng JSON cuối cùng.
145
+ 7. Chỉ in ra CHÍNH XÁC một JSON object, không thêm chữ 'json', không dùng ``` hoặc văn bản thừa.
146
+
147
+ Ví dụ định dạng:
148
+ {{
149
+ "summary": "Theo Thông tư 02 về xử lý điều lệnh trong CAND, đơn vị có 12% cán bộ vi phạm điều lệnh trong năm sẽ bị hạ 1 bậc thi đua (Điều 7).",
150
+ "details": [
151
+ "- Điều 7 quy định: Đơn vị có từ 10% đến dưới 20% cán bộ vi phạm điều lệnh trong năm sẽ bị hạ 1 bậc thi đua.",
152
+ "- Điều 8 quy định: Đơn vị có từ 20% trở lên cán bộ vi phạm điều lệnh trong năm sẽ bị hạ 2 bậc thi đua."
153
+ ],
154
+ "citations": [
155
+ {{
156
+ "document_title": "Thông tư 02 về xử lý điều lệnh trong CAND",
157
+ "section_code": "Điều 7",
158
+ "page_range": "5-6",
159
+ "summary": "Quy định về hạ bậc thi đua theo tỷ lệ vi phạm",
160
+ "snippet": "Đơn vị có từ 10% đến dưới 20% cán bộ vi phạm điều lệnh trong năm sẽ bị hạ 1 bậc thi đua..."
161
+ }}
162
+ ]
163
+ }}
164
+
165
+ {prefill_block}
166
+
167
+ {retry_hint_block}
168
+
169
+ {format_instructions}
170
+ """
171
+ ).strip()
172
+
173
+ return prompt
174
+
175
+
176
+ def format_structured_legal_answer(answer: LegalAnswer) -> str:
177
+ """Convert structured answer into human-friendly text with citations."""
178
+
179
+ lines: List[str] = []
180
+ if answer.summary:
181
+ lines.append(answer.summary.strip())
182
+
183
+ if answer.details:
184
+ lines.append("")
185
+ lines.append("Chi tiết chính:")
186
+ for bullet in answer.details:
187
+ lines.append(f"- {bullet.strip()}")
188
+
189
+ if answer.citations:
190
+ lines.append("")
191
+ lines.append("Trích dẫn chi tiết:")
192
+ for idx, citation in enumerate(answer.citations, 1):
193
+ page_text = f" (Trang: {citation.page_range})" if citation.page_range else ""
194
+ lines.append(
195
+ f"{idx}. {citation.document_title} – {citation.section_code}{page_text}"
196
+ )
197
+ lines.append(f" Tóm tắt: {citation.summary.strip()}")
198
+ lines.append(f" Trích đoạn: {citation.snippet.strip()}")
199
+
200
+ return "\n".join(lines).strip()
201
+
202
+
203
+ def _format_page_range(doc: object) -> Optional[str]:
204
+ start = getattr(doc, "page_start", None)
205
+ end = getattr(doc, "page_end", None)
206
+ if start and end:
207
+ if start == end:
208
+ return str(start)
209
+ return f"{start}-{end}"
210
+ if start:
211
+ return str(start)
212
+ if end:
213
+ return str(end)
214
+ return None
215
+
216
+
217
+ def parse_structured_output(
218
+ parser: PydanticOutputParser, raw_output: str
219
+ ) -> Optional[LegalAnswer]:
220
+ """Parse raw LLM output to LegalAnswer if possible."""
221
+
222
+ if not raw_output:
223
+ return None
224
+ try:
225
+ return parser.parse(raw_output)
226
+ except OutputParserException:
227
+ snippet = raw_output.strip().replace("\n", " ")
228
+ logger.warning(
229
+ "[LLM] Structured parse failed. Preview: %s",
230
+ snippet[:400],
231
+ )
232
+ json_candidate = _extract_json_block(raw_output)
233
+ if json_candidate:
234
+ try:
235
+ return parser.parse(json_candidate)
236
+ except OutputParserException:
237
+ logger.warning("[LLM] JSON reparse also failed.")
238
+ return None
239
+ return None
240
+
241
+
242
+ def _extract_json_block(text: str) -> Optional[str]:
243
+ """
244
+ Best-effort extraction of the first JSON object within text.
245
+ """
246
+ stripped = text.strip()
247
+ if stripped.startswith("```"):
248
+ stripped = stripped.lstrip("`")
249
+ if stripped.lower().startswith("json"):
250
+ stripped = stripped[4:]
251
+ stripped = stripped.strip("`").strip()
252
+
253
+ start = text.find("{")
254
+ if start == -1:
255
+ return None
256
+
257
+ stack = 0
258
+ for idx in range(start, len(text)):
259
+ char = text[idx]
260
+ if char == "{":
261
+ stack += 1
262
+ elif char == "}":
263
+ stack -= 1
264
+ if stack == 0:
265
+ payload = text[start : idx + 1]
266
+ # Remove code fences if present
267
+ payload = payload.strip()
268
+ if payload.startswith("```"):
269
+ payload = payload.strip("`").strip()
270
+ try:
271
+ json.loads(payload)
272
+ return payload
273
+ except json.JSONDecodeError:
274
+ return None
275
+ return None
276
+
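
For orientation, a minimal sketch of the structured-output round trip these helpers support (not part of the commit; generate() is a placeholder for whatever call llm_integration.py exposes, and sections stands for LegalSection-like objects retrieved earlier):

# Illustrative sketch - not part of this diff.
from hue_portal.chatbot.structured_legal import (
    build_structured_legal_prompt,
    format_structured_legal_answer,
    get_legal_output_parser,
    parse_structured_output,
)

query = "Đơn vị có 12% cán bộ vi phạm điều lệnh thì xử lý thế nào?"  # illustrative query

parser = get_legal_output_parser()
prompt = build_structured_legal_prompt(query, sections, parser)  # sections: retrieved LegalSection objects
raw_output = generate(prompt)          # placeholder LLM call, not defined in this diff
answer = parse_structured_output(parser, raw_output)
if answer is not None:
    print(format_structured_legal_answer(answer))
else:
    print("Structured parse failed; caller should retry or fall back to a template.")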
backend/hue_portal/chatbot/tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Test suite for chatbot module."""
backend/hue_portal/chatbot/tests/__pycache__/test_smoke.cpython-310.pyc ADDED
Binary file (1.71 kB).
 
backend/hue_portal/chatbot/tests/test_intent_keywords.py ADDED
@@ -0,0 +1,29 @@
1
+ import unittest
2
+
3
+ from hue_portal.chatbot.chatbot import Chatbot
4
+
5
+
6
+ class IntentKeywordTests(unittest.TestCase):
7
+ @classmethod
8
+ def setUpClass(cls):
9
+ cls.bot = Chatbot()
10
+
11
+ def test_office_keywords_have_priority(self):
12
+ intent, confidence = self.bot.classify_intent("Cho mình địa chỉ Công an phường An Cựu", context=None)
13
+ self.assertEqual(intent, "search_office")
14
+ self.assertGreaterEqual(confidence, 0.7)
15
+
16
+ def test_document_code_forces_search_legal(self):
17
+ intent, confidence = self.bot.classify_intent("Quyết định 69 quy định gì về kỷ luật?", context=None)
18
+ self.assertEqual(intent, "search_legal")
19
+ self.assertGreaterEqual(confidence, 0.8)
20
+
21
+ def test_fine_keywords_override_greeting(self):
22
+ intent, confidence = self.bot.classify_intent("Chào bạn mức phạt vượt đèn đỏ là bao nhiêu", context=None)
23
+ self.assertEqual(intent, "search_fine")
24
+ self.assertGreaterEqual(confidence, 0.8)
25
+
26
+
27
+ if __name__ == "__main__":
28
+ unittest.main()
29
+
backend/hue_portal/chatbot/tests/test_intent_training.py ADDED
@@ -0,0 +1,22 @@
1
+ import json
2
+ from pathlib import Path
3
+ import unittest
4
+
5
+ from hue_portal.chatbot.training import train_intent
6
+
7
+
8
+ class IntentTrainingTestCase(unittest.TestCase):
9
+ def test_train_pipeline_produces_artifacts(self):
10
+ model_path, metrics_path, metrics = train_intent.train(train_intent.DEFAULT_DATASET, test_size=0.3, random_state=123)
11
+
12
+ self.assertTrue(model_path.exists(), "Model artifact should be created")
13
+ self.assertTrue(metrics_path.exists(), "Metrics file should be created")
14
+
15
+ payload = json.loads(metrics_path.read_text(encoding="utf-8"))
16
+ self.assertIn("accuracy", payload)
17
+ self.assertGreaterEqual(payload["accuracy"], 0.0)
18
+ self.assertLessEqual(payload["accuracy"], 1.0)
19
+
20
+
21
+ if __name__ == "__main__":
22
+ unittest.main()
backend/hue_portal/chatbot/tests/test_router.py ADDED
@@ -0,0 +1,41 @@
+ from django.test import SimpleTestCase
+
+ from hue_portal.chatbot.router import IntentRoute, decide_route
+
+
+ class RouterDecisionTests(SimpleTestCase):
+     def test_simple_greeting_routed_to_greeting(self):
+         decision = decide_route("chào bạn", "greeting", 0.9)
+         self.assertEqual(decision.route, IntentRoute.GREETING)
+         self.assertEqual(decision.forced_intent, "greeting")
+
+     def test_doc_code_forces_search_legal(self):
+         decision = decide_route("Cho tôi xem quyết định 69 nói gì", "general_query", 0.4)
+         self.assertEqual(decision.route, IntentRoute.SEARCH)
+         self.assertEqual(decision.forced_intent, "search_legal")
+
+     def test_low_confidence_goes_to_small_talk(self):
+         decision = decide_route("tôi mệt quá", "general_query", 0.2)
+         self.assertEqual(decision.route, IntentRoute.SMALL_TALK)
+         self.assertEqual(decision.forced_intent, "general_query")
+
+     def test_confident_fine_query_stays_search(self):
+         decision = decide_route("mức phạt vượt đèn đỏ là gì", "search_fine", 0.92)
+         self.assertEqual(decision.route, IntentRoute.SEARCH)
+         self.assertIsNone(decision.forced_intent)
+
+     def test_small_talk_routes_to_small_talk(self):
+         decision = decide_route("mệt quá hôm nay", "general_query", 0.4)
+         self.assertEqual(decision.route, IntentRoute.SMALL_TALK)
+         self.assertEqual(decision.forced_intent, "general_query")
+
+     def test_keyword_override_forces_fine_intent(self):
+         decision = decide_route("phạt vượt đèn đỏ sao vậy", "general_query", 0.5)
+         self.assertEqual(decision.route, IntentRoute.SEARCH)
+         self.assertEqual(decision.forced_intent, "search_fine")
+
+     def test_keyword_override_forces_procedure_intent(self):
+         decision = decide_route("thủ tục cư trú cần hồ sơ gì", "general_query", 0.5)
+         self.assertEqual(decision.route, IntentRoute.SEARCH)
+         self.assertEqual(decision.forced_intent, "search_procedure")
+
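Aside (illustration, not the committed code): these cases exercise the public contract of decide_route rather than its internals. A minimal sketch of the interface the assertions assume, using hypothetical names where the real definitions in backend/hue_portal/chatbot/router.py are not shown here:

    from dataclasses import dataclass
    from enum import Enum, auto
    from typing import Optional

    class IntentRoute(Enum):      # members referenced by the tests above
        GREETING = auto()
        SEARCH = auto()
        SMALL_TALK = auto()

    @dataclass
    class RouteDecision:          # hypothetical name for the returned object
        route: IntentRoute
        forced_intent: Optional[str] = None   # None when the classifier's intent is kept

    def decide_route(message: str, intent: str, confidence: float) -> RouteDecision:
        """Map (message, predicted intent, confidence) to a route, possibly overriding the intent."""
        ...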
backend/hue_portal/chatbot/tests/test_smoke.py ADDED
@@ -0,0 +1,29 @@
+ """Smoke tests to ensure chatbot + essential management commands work."""
+
+ from __future__ import annotations
+
+ from django.core.management import call_command, load_command_class
+ from django.test import TestCase
+
+ from hue_portal.chatbot.chatbot import get_chatbot
+
+
+ class ChatbotSmokeTests(TestCase):
+     """Verify chatbot core components can initialize without errors."""
+
+     def test_chatbot_initializes_once(self) -> None:
+         bot = get_chatbot()
+         self.assertIsNotNone(bot)
+         # Intent classifier should be available after initialization/training
+         self.assertIsNotNone(bot.intent_classifier)
+
+
+ class ManagementCommandSmokeTests(TestCase):
+     """Ensure critical management commands are wired correctly."""
+
+     def test_django_check_command(self) -> None:
+         call_command("check")
+
+     def test_retry_ingestion_command_loads(self) -> None:
+         load_command_class("hue_portal.core", "retry_ingestion_job")
+
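Usage note (not part of the diff): the router and smoke suites subclass Django's SimpleTestCase/TestCase, so they are meant to run under Django's test runner with settings configured, typically something like `python manage.py test hue_portal.chatbot.tests` from the directory containing manage.py. The two unittest-based modules (intent keywords and intent training) also expose `__main__` blocks, so they can additionally be run as standalone scripts.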