Spaces: Running
handrix committed
Commit · ae4e2a6
1 Parent(s): bc7497c
Initial deployment - Toxic Detection API
- .dockerignore +63 -0
- .env.example +16 -0
- .gitattributes +1 -0
- Dockerfile +49 -0
- app/__init__.py +0 -0
- app/api/__init__.py +0 -0
- app/api/routes.py +113 -0
- app/core/__init__.py +0 -0
- app/core/config.py +46 -0
- app/core/exceptions.py +44 -0
- app/main.py +115 -0
- app/models/__init__.py +0 -0
- app/models/model_loader.py +123 -0
- app/models/phobert_model.py +105 -0
- app/schemas/__init__.py +0 -0
- app/schemas/requests.py +52 -0
- app/schemas/responses.py +138 -0
- app/services/__init__.py +0 -0
- app/services/analysis_service.py +318 -0
- app/services/gradient_service.py +167 -0
- app/services/html_generator.py +130 -0
- app/services/text_processor.py +142 -0
- models/PhoBERTFineTuned_best.pth +3 -0
- requirements.txt +9 -0
.dockerignore
ADDED
@@ -0,0 +1,63 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+venv/
+ENV/
+env/
+.venv
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Git
+.git/
+.gitignore
+
+# Tests
+tests/
+.pytest_cache/
+.coverage
+htmlcov/
+
+# Documentation
+*.md
+!README.md
+
+# Environment
+.env
+.env.local
+
+# Logs
+*.log
+
+# Other
+*.tar.gz
+*.zip
.env.example
ADDED
@@ -0,0 +1,16 @@
+# Model Configuration
+MODEL_NAME=vinai/phobert-base
+MODEL_PATH=./models/PhoBERTFineTuned_best.pth
+MAX_LENGTH=128
+DEVICE=cuda
+
+# API Configuration
+API_HOST=0.0.0.0
+API_PORT=8000
+API_RELOAD=True
+
+# CORS
+ALLOWED_ORIGINS=*
+
+# Logging
+LOG_LEVEL=INFO
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+models/*.pth filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,49 @@
+# Multi-stage build for smaller image size
+FROM python:3.10-slim as builder
+
+# Set working directory
+WORKDIR /app
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --user -r requirements.txt
+
+# Final stage
+FROM python:3.10-slim
+
+# Set working directory
+WORKDIR /app
+
+# Copy Python packages from builder
+COPY --from=builder /root/.local /root/.local
+
+# Make sure scripts in .local are usable
+ENV PATH=/root/.local/bin:$PATH
+
+# Copy application code
+COPY ./app ./app
+
+# Create models directory
+RUN mkdir -p ./models
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV MODEL_PATH=/app/models/PhoBERTFineTuned_best.pth
+
+# Expose port
+EXPOSE 7860
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD python -c "import requests; requests.get('http://localhost:7860/api/v1/health')"
+
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
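The HEALTHCHECK instruction probes GET /api/v1/health from inside the container. A minimal host-side sketch of the same probe, assuming the image has been built and started with port 7860 published on localhost and that the requests package is available on the host:

# Host-side probe mirroring the container HEALTHCHECK (sketch; assumes the
# container is running with port 7860 published on localhost).
import requests

def check_health(base_url: str = "http://localhost:7860") -> bool:
    resp = requests.get(f"{base_url}/api/v1/health", timeout=10)
    resp.raise_for_status()
    payload = resp.json()
    # Field names follow HealthResponse in app/schemas/responses.py
    print(payload["status"], payload["model_loaded"], payload["device"])
    return payload["status"] == "healthy"

if __name__ == "__main__":
    check_health()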
app/__init__.py
ADDED
File without changes
app/api/__init__.py
ADDED
File without changes
app/api/routes.py
ADDED
@@ -0,0 +1,113 @@
+"""
+API Routes
+==========
+FastAPI routes (Interface Segregation)
+"""
+
+from fastapi import APIRouter, Depends, HTTPException, status
+from typing import Dict
+
+from app.schemas.requests import AnalysisRequest
+from app.schemas.responses import AnalysisResponse, HealthResponse, ErrorResponse
+from app.services.analysis_service import analysis_service
+from app.models.model_loader import model_loader
+from app.core.config import settings
+from app.core.exceptions import ModelNotLoadedException, AnalysisException
+
+
+router = APIRouter()
+
+
+@router.get(
+    "/",
+    response_model=Dict[str, str],
+    summary="Root endpoint",
+    tags=["General"]
+)
+async def root():
+    """Root endpoint - API information"""
+    return {
+        "message": "Toxic Text Detection API",
+        "version": settings.API_VERSION,
+        "docs": "/docs",
+        "health": "/api/v1/health"
+    }
+
+
+@router.get(
+    "/health",
+    response_model=HealthResponse,
+    summary="Health check",
+    tags=["General"]
+)
+async def health_check():
+    """
+    Health check endpoint
+
+    Returns service status and model information
+    """
+    return HealthResponse(
+        status="healthy" if model_loader.is_loaded() else "unhealthy",
+        model_loaded=model_loader.is_loaded(),
+        device=str(model_loader.device) if model_loader.is_loaded() else "unknown",
+        model_name=settings.MODEL_NAME,
+        version=settings.API_VERSION
+    )
+
+
+@router.post(
+    "/analyze",
+    response_model=AnalysisResponse,
+    responses={
+        200: {"description": "Analysis successful"},
+        400: {"model": ErrorResponse, "description": "Invalid input"},
+        500: {"model": ErrorResponse, "description": "Analysis failed"},
+        503: {"model": ErrorResponse, "description": "Model not loaded"}
+    },
+    summary="Analyze text for toxicity",
+    tags=["Analysis"]
+)
+async def analyze_text(request: AnalysisRequest):
+    """
+    Analyze text for toxic content
+
+    This endpoint analyzes Vietnamese text to detect toxic content using
+    a fine-tuned PhoBERT model with gradient-based explainability.
+
+    **Features:**
+    - Sentence-level toxicity detection
+    - Word-level importance scores
+    - HTML highlighting of toxic content
+    - Detailed statistics
+
+    **Parameters:**
+    - **text**: Text to analyze (required)
+    - **include_html**: Include HTML highlighting (default: true)
+    - **include_word_scores**: Include word-level scores (default: true)
+    - **include_summary_table**: Include summary table (default: false)
+
+    **Returns:**
+    - Overall toxicity label (toxic/clean)
+    - Sentence-level analysis
+    - Word-level scores and toxic words summary
+    - HTML with highlighted toxic content
+    - Statistical information
+    """
+    # Check if model is loaded
+    if not model_loader.is_loaded():
+        raise ModelNotLoadedException()
+
+    # Perform analysis
+    try:
+        result = analysis_service.analyze(request)
+        return result
+    except AnalysisException as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=str(e)
+        )
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Unexpected error: {str(e)}"
+        )
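A hedged client sketch for the POST /analyze route, assuming the service is reachable at http://localhost:7860 (the port exposed by the Dockerfile); the request fields mirror AnalysisRequest and the printed keys mirror AnalysisResponse:

# Hypothetical client call against a locally running instance.
import requests

payload = {
    "text": "Đồ ngu ngốc, mất dạy! Cảm ơn bạn đã chia sẻ.",
    "include_html": True,
    "include_word_scores": True,
    "include_summary_table": False,
}
resp = requests.post("http://localhost:7860/api/v1/analyze", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
print(result["overall_label"], result["toxic_sentence_count"], "of", result["total_sentences"])
for item in result["toxic_words_summary"]:
    print(item["word"], round(item["score"], 3), item["sentences"])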
app/core/__init__.py
ADDED
File without changes
app/core/config.py
ADDED
@@ -0,0 +1,46 @@
+"""
+Core Configuration
+==================
+Application settings using Pydantic Settings
+"""
+
+from pydantic_settings import BaseSettings
+from typing import List
+import torch
+
+
+class Settings(BaseSettings):
+    """Application settings"""
+
+    # Model Configuration
+    MODEL_NAME: str = "vinai/phobert-base"
+    MODEL_PATH: str = "./models/PhoBERTFineTuned_best.pth"
+    MAX_LENGTH: int = 128
+    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # API Configuration
+    API_TITLE: str = "Toxic Text Detection API"
+    API_VERSION: str = "1.0.0"
+    API_DESCRIPTION: str = "Vietnamese toxic text detection with gradient-based explainability"
+    API_HOST: str = "0.0.0.0"
+    API_PORT: int = 8000
+    API_RELOAD: bool = True
+
+    # CORS
+    ALLOWED_ORIGINS: List[str] = ["*"]
+
+    # Analysis Settings
+    GRADIENT_STEPS: int = 20
+    PERCENTILE_THRESHOLD: int = 75
+    MIN_WORD_LENGTH: int = 2
+
+    # Logging
+    LOG_LEVEL: str = "INFO"
+
+    class Config:
+        env_file = ".env"
+        case_sensitive = True
+
+
+# Singleton instance
+settings = Settings()
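Because Settings subclasses pydantic's BaseSettings, values come from class defaults, the .env file, or the process environment, with environment variables taking precedence. A small sketch (assumes pydantic-settings v2 and the project dependencies installed; the override path is hypothetical):

# Sketch of overriding settings via the environment (pydantic-settings v2).
import os

os.environ["MODEL_PATH"] = "/tmp/PhoBERTFineTuned_best.pth"  # hypothetical path
os.environ["LOG_LEVEL"] = "DEBUG"

from app.core.config import Settings

settings = Settings()  # environment values win over .env and class defaults
print(settings.MODEL_PATH)  # /tmp/PhoBERTFineTuned_best.pth
print(settings.LOG_LEVEL)   # DEBUG
print(settings.DEVICE)      # "cuda" only if torch.cuda.is_available()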
app/core/exceptions.py
ADDED
@@ -0,0 +1,44 @@
+"""
+Custom Exceptions
+=================
+Application-specific exceptions
+"""
+
+from fastapi import HTTPException, status
+
+
+class ToxicDetectionException(HTTPException):
+    """Base exception for toxic detection"""
+
+    def __init__(self, detail: str, status_code: int = status.HTTP_500_INTERNAL_SERVER_ERROR):
+        super().__init__(status_code=status_code, detail=detail)
+
+
+class ModelNotLoadedException(ToxicDetectionException):
+    """Raised when model is not loaded"""
+
+    def __init__(self):
+        super().__init__(
+            detail="Model not loaded. Please check server logs.",
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE
+        )
+
+
+class InvalidTextException(ToxicDetectionException):
+    """Raised when input text is invalid"""
+
+    def __init__(self, detail: str = "Invalid text input"):
+        super().__init__(
+            detail=detail,
+            status_code=status.HTTP_400_BAD_REQUEST
+        )
+
+
+class AnalysisException(ToxicDetectionException):
+    """Raised when analysis fails"""
+
+    def __init__(self, detail: str = "Analysis failed"):
+        super().__init__(
+            detail=detail,
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR
+        )
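Since these exceptions subclass FastAPI's HTTPException, each carries its own status code and detail, which the handler in app/main.py later reshapes into the ErrorResponse format. A quick illustration, no server required:

# Quick look at what the custom exceptions carry.
from app.core.exceptions import ModelNotLoadedException, InvalidTextException

try:
    raise ModelNotLoadedException()
except ModelNotLoadedException as exc:
    print(exc.status_code, exc.detail)  # 503, "Model not loaded. Please check server logs."

try:
    raise InvalidTextException()
except InvalidTextException as exc:
    print(exc.status_code, exc.detail)  # 400, "Invalid text input"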
app/main.py
ADDED
@@ -0,0 +1,115 @@
+"""
+Main FastAPI Application
+=========================
+Application entry point
+"""
+
+import logging
+from contextlib import asynccontextmanager
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+
+from app.core.config import settings
+from app.core.exceptions import ToxicDetectionException
+from app.models.model_loader import model_loader
+from app.api.routes import router
+
+
+# Configure logging
+logging.basicConfig(
+    level=getattr(logging, settings.LOG_LEVEL),
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Lifespan events
+
+    Startup: Load model
+    Shutdown: Cleanup
+    """
+    # Startup
+    logger.info("Starting up...")
+    try:
+        logger.info("Loading model...")
+        model_loader.load()
+        logger.info("Model loaded successfully")
+    except Exception as e:
+        logger.error(f"Failed to load model: {str(e)}")
+        # Continue anyway - health endpoint will show model not loaded
+
+    yield
+
+    # Shutdown
+    logger.info("Shutting down...")
+
+
+# Create FastAPI app
+app = FastAPI(
+    title=settings.API_TITLE,
+    description=settings.API_DESCRIPTION,
+    version=settings.API_VERSION,
+    lifespan=lifespan,
+    docs_url="/docs",
+    redoc_url="/redoc",
+    openapi_url="/openapi.json"
+)
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=settings.ALLOWED_ORIGINS,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+# Exception handlers
+@app.exception_handler(ToxicDetectionException)
+async def toxic_detection_exception_handler(request, exc: ToxicDetectionException):
+    """Handle custom exceptions"""
+    return JSONResponse(
+        status_code=exc.status_code,
+        content={
+            "success": False,
+            "error": exc.detail,
+            "detail": None
+        }
+    )
+
+
+@app.exception_handler(Exception)
+async def general_exception_handler(request, exc: Exception):
+    """Handle general exceptions"""
+    logger.error(f"Unhandled exception: {str(exc)}", exc_info=True)
+    return JSONResponse(
+        status_code=500,
+        content={
+            "success": False,
+            "error": "Internal server error",
+            "detail": str(exc) if settings.LOG_LEVEL == "DEBUG" else None
+        }
+    )
+
+
+# Include routers
+app.include_router(router, prefix="/api/v1", tags=["v1"])
+app.include_router(router, prefix="", tags=["root"])
+
+
+# For direct run
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(
+        "app.main:app",
+        host=settings.API_HOST,
+        port=settings.API_PORT,
+        reload=settings.API_RELOAD,
+        log_level=settings.LOG_LEVEL.lower()
+    )
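A sketch of an in-process smoke test with FastAPI's TestClient (assumes the test client's httpx dependency is installed; if the .pth weights are absent, startup continues per the lifespan handler and /health simply reports unhealthy):

# In-process smoke test (sketch).
from fastapi.testclient import TestClient

from app.main import app

with TestClient(app) as client:  # context manager runs the lifespan startup/shutdown
    print(client.get("/").json()["message"])          # Toxic Text Detection API
    health = client.get("/api/v1/health").json()
    print(health["status"], health["model_loaded"])   # "unhealthy", False without weights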
app/models/__init__.py
ADDED
File without changes
app/models/model_loader.py
ADDED
@@ -0,0 +1,123 @@
+"""
+Model Loader
+============
+Responsible for loading and initializing models (Single Responsibility)
+"""
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+from typing import Tuple
+import logging
+
+from app.models.phobert_model import PhoBERTFineTuned
+from app.core.config import settings
+from app.core.exceptions import ModelNotLoadedException
+
+
+logger = logging.getLogger(__name__)
+
+
+class ModelLoader:
+    """
+    Model loader service
+
+    Responsibilities:
+    - Load tokenizer
+    - Load base model
+    - Load fine-tuned weights
+    - Initialize model on correct device
+    """
+
+    def __init__(self):
+        self._model: PhoBERTFineTuned | None = None
+        self._tokenizer: AutoTokenizer | None = None
+        self._device: torch.device | None = None
+
+    def load(self) -> Tuple[PhoBERTFineTuned, AutoTokenizer, torch.device]:
+        """
+        Load model, tokenizer, and set device
+
+        Returns:
+            model: Loaded model
+            tokenizer: Loaded tokenizer
+            device: Device (CPU/CUDA)
+
+        Raises:
+            ModelNotLoadedException: If loading fails
+        """
+        try:
+            # Set device
+            self._device = torch.device(settings.DEVICE)
+            logger.info(f"Using device: {self._device}")
+
+            # Load tokenizer
+            logger.info(f"Loading tokenizer: {settings.MODEL_NAME}")
+            self._tokenizer = AutoTokenizer.from_pretrained(settings.MODEL_NAME)
+
+            # Load base model
+            logger.info(f"Loading base model: {settings.MODEL_NAME}")
+            phobert = AutoModel.from_pretrained(settings.MODEL_NAME)
+
+            # Initialize fine-tuned model
+            logger.info("Initializing fine-tuned model")
+            self._model = PhoBERTFineTuned(
+                embedding_model=phobert,
+                hidden_dim=768,
+                dropout=0.3,
+                num_classes=2,
+                num_layers_to_finetune=4,
+                pooling='mean'
+            )
+
+            # Load weights
+            logger.info(f"Loading weights from: {settings.MODEL_PATH}")
+            state_dict = torch.load(
+                settings.MODEL_PATH,
+                map_location=self._device
+            )
+            self._model.load_state_dict(state_dict)
+
+            # Move to device and set eval mode
+            self._model = self._model.to(self._device)
+            self._model.eval()
+
+            logger.info("Model loaded successfully")
+
+            return self._model, self._tokenizer, self._device
+
+        except Exception as e:
+            logger.error(f"Failed to load model: {str(e)}")
+            raise ModelNotLoadedException()
+
+    @property
+    def model(self) -> PhoBERTFineTuned:
+        """Get loaded model"""
+        if self._model is None:
+            raise ModelNotLoadedException()
+        return self._model
+
+    @property
+    def tokenizer(self) -> AutoTokenizer:
+        """Get loaded tokenizer"""
+        if self._tokenizer is None:
+            raise ModelNotLoadedException()
+        return self._tokenizer
+
+    @property
+    def device(self) -> torch.device:
+        """Get device"""
+        if self._device is None:
+            raise ModelNotLoadedException()
+        return self._device
+
+    def is_loaded(self) -> bool:
+        """Check if model is loaded"""
+        return all([
+            self._model is not None,
+            self._tokenizer is not None,
+            self._device is not None
+        ])
+
+
+# Singleton instance
+model_loader = ModelLoader()
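A sketch of using the singleton outside the API, assuming the vinai/phobert-base weights can be downloaded and the fine-tuned checkpoint exists at settings.MODEL_PATH; class index 1 is the one treated as toxic elsewhere in the service:

# Direct use of the loader outside FastAPI (sketch).
import torch

from app.models.model_loader import model_loader

model, tokenizer, device = model_loader.load()

enc = tokenizer("Cảm ơn bạn đã chia sẻ.", return_tensors="pt",
                truncation=True, max_length=128)
with torch.no_grad():
    logits, _ = model(enc["input_ids"].to(device), enc["attention_mask"].to(device))
probs = torch.softmax(logits, dim=1)[0]
print("P(class 0), P(class 1):", probs.tolist())  # class 1 is treated as toxic downstream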
app/models/phobert_model.py
ADDED
@@ -0,0 +1,105 @@
+"""
+PhoBERT Model
+=============
+Model architecture definition (Single Responsibility)
+"""
+
+import torch
+import torch.nn as nn
+from typing import Tuple, Optional
+
+
+class PhoBERTFineTuned(nn.Module):
+    """
+    Fine-tuned PhoBERT model for toxic text classification
+
+    Responsibilities:
+    - Define model architecture
+    - Forward pass computation
+    """
+
+    def __init__(
+        self,
+        embedding_model: nn.Module,
+        hidden_dim: int = 768,
+        dropout: float = 0.3,
+        num_classes: int = 2,
+        num_layers_to_finetune: int = 4,
+        pooling: str = 'mean'
+    ):
+        super(PhoBERTFineTuned, self).__init__()
+
+        self.embedding = embedding_model
+        self.pooling = pooling
+        self.num_layers_to_finetune = num_layers_to_finetune
+
+        # Freeze all parameters
+        for param in self.embedding.parameters():
+            param.requires_grad = False
+
+        # Unfreeze last N layers
+        if num_layers_to_finetune > 0:
+            total_layers = len(self.embedding.encoder.layer)
+            layers_to_train = list(range(
+                total_layers - num_layers_to_finetune,
+                total_layers
+            ))
+
+            for layer_idx in layers_to_train:
+                for param in self.embedding.encoder.layer[layer_idx].parameters():
+                    param.requires_grad = True
+
+            if hasattr(self.embedding, 'pooler') and self.embedding.pooler is not None:
+                for param in self.embedding.pooler.parameters():
+                    param.requires_grad = True
+
+        # Classification head
+        self.dropout = nn.Dropout(dropout)
+        self.fc1 = nn.Linear(hidden_dim, 256)
+        self.fc2 = nn.Linear(256, num_classes)
+        self.relu = nn.ReLU()
+        self.layer_norm = nn.LayerNorm(hidden_dim)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        return_embeddings: bool = False
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Forward pass
+
+        Args:
+            input_ids: Input token IDs
+            attention_mask: Attention mask
+            return_embeddings: Whether to return embeddings
+
+        Returns:
+            logits: Classification logits
+            embeddings: Hidden states (if return_embeddings=True)
+        """
+        # Get embeddings
+        outputs = self.embedding(input_ids, attention_mask=attention_mask)
+        embeddings = outputs.last_hidden_state
+
+        # Pooling
+        if self.pooling == 'cls':
+            pooled = embeddings[:, 0, :]
+        elif self.pooling == 'mean':
+            mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
+            sum_embeddings = torch.sum(embeddings * mask_expanded, 1)
+            sum_mask = mask_expanded.sum(1)
+            pooled = sum_embeddings / sum_mask
+        else:
+            raise ValueError(f"Unknown pooling method: {self.pooling}")
+
+        # Classification
+        pooled = self.layer_norm(pooled)
+        out = self.dropout(pooled)
+        out = self.relu(self.fc1(out))
+        out = self.dropout(out)
+        logits = self.fc2(out)
+
+        if return_embeddings:
+            return logits, embeddings
+        return logits, None
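The 'mean' branch of forward() is a masked mean over real tokens only. A toy check of that pooling arithmetic with one padded position:

# Toy check of the masked mean pooling used in forward(): padded positions
# (attention_mask == 0) must not contribute to the sentence vector.
import torch

embeddings = torch.tensor([[[1.0, 1.0],
                            [3.0, 5.0],
                            [9.0, 9.0]]])        # (batch=1, seq=3, hidden=2)
attention_mask = torch.tensor([[1, 1, 0]])       # last position is padding

mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
sum_embeddings = torch.sum(embeddings * mask_expanded, 1)
sum_mask = mask_expanded.sum(1)
pooled = sum_embeddings / sum_mask
print(pooled)  # tensor([[2., 3.]]) -- the mean of the two real tokens only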
app/schemas/__init__.py
ADDED
File without changes
app/schemas/requests.py
ADDED
@@ -0,0 +1,52 @@
+"""
+Request Schemas
+===============
+DTOs for API requests
+"""
+
+from pydantic import BaseModel, Field, field_validator
+
+
+class AnalysisRequest(BaseModel):
+    """Request for text analysis"""
+
+    text: str = Field(
+        ...,
+        description="Text to analyze for toxicity",
+        min_length=1,
+        max_length=5000,
+        examples=["Đồ ngu ngốc, mất dạy!"]
+    )
+
+    include_html: bool = Field(
+        default=True,
+        description="Include HTML highlighting in response"
+    )
+
+    include_word_scores: bool = Field(
+        default=True,
+        description="Include detailed word-level scores"
+    )
+
+    include_summary_table: bool = Field(
+        default=False,
+        description="Include summary table of all words"
+    )
+
+    @field_validator('text')
+    @classmethod
+    def validate_text(cls, v: str) -> str:
+        """Validate text input"""
+        if not v or not v.strip():
+            raise ValueError("Text cannot be empty or only whitespace")
+        return v.strip()
+
+    class Config:
+        json_schema_extra = {
+            "example": {
+                "text": "Đồ ngu ngốc, mất dạy! Cảm ơn bạn đã chia sẻ.",
+                "include_html": True,
+                "include_word_scores": True,
+                "include_summary_table": False
+            }
+        }
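A sketch of the validator's behaviour under pydantic v2: surrounding whitespace is stripped, and whitespace-only input is rejected:

# Validation behaviour sketch (pydantic v2).
from pydantic import ValidationError

from app.schemas.requests import AnalysisRequest

req = AnalysisRequest(text="  Cảm ơn bạn đã chia sẻ.  ")
print(repr(req.text))        # surrounding whitespace stripped by validate_text
print(req.include_html)      # True (default)

try:
    AnalysisRequest(text="   ")
except ValidationError as exc:
    print(exc.errors()[0]["msg"])  # mentions "empty or only whitespace"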
app/schemas/responses.py
ADDED
@@ -0,0 +1,138 @@
+"""
+Response Schemas
+================
+DTOs for API responses
+"""
+
+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict
+from enum import Enum
+
+
+class SentimentLabel(str, Enum):
+    """Sentiment labels"""
+    TOXIC = "toxic"
+    CLEAN = "clean"
+
+
+class WordScore(BaseModel):
+    """Word-level score information"""
+
+    word: str = Field(..., description="The word")
+    score: float = Field(..., ge=0.0, le=1.0, description="Toxicity score (0-1)")
+    position: Dict[str, int] = Field(..., description="Position in text {start, end}")
+    is_toxic: bool = Field(..., description="Whether word is toxic")
+    is_stop_word: bool = Field(..., description="Whether word is a stop word")
+
+
+class SentenceResult(BaseModel):
+    """Sentence-level analysis result"""
+
+    sentence_number: int = Field(..., description="Sentence index (1-based)")
+    text: str = Field(..., description="Sentence text")
+    label: SentimentLabel = Field(..., description="Toxic or clean")
+    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score")
+    threshold: float = Field(..., ge=0.0, le=1.0, description="Threshold used")
+    word_count: int = Field(..., description="Number of words")
+    word_scores: Optional[List[WordScore]] = Field(None, description="Word-level scores")
+
+
+class ToxicWordSummary(BaseModel):
+    """Summary of toxic words"""
+
+    word: str = Field(..., description="Toxic word")
+    score: float = Field(..., ge=0.0, le=1.0, description="Maximum score")
+    occurrences: int = Field(..., description="Number of occurrences")
+    sentences: List[int] = Field(..., description="Sentence numbers containing this word")
+
+
+class Statistics(BaseModel):
+    """Overall statistics"""
+
+    total_words: int = Field(..., description="Total number of words")
+    toxic_words: int = Field(..., description="Number of toxic words")
+    mean_score: float = Field(..., ge=0.0, le=1.0, description="Mean toxicity score")
+    median_score: float = Field(..., ge=0.0, le=1.0, description="Median toxicity score")
+    max_score: float = Field(..., ge=0.0, le=1.0, description="Maximum toxicity score")
+    min_score: float = Field(..., ge=0.0, le=1.0, description="Minimum toxicity score")
+
+
+class AnalysisResponse(BaseModel):
+    """Complete analysis response"""
+
+    success: bool = Field(True, description="Whether analysis succeeded")
+    text: str = Field(..., description="Original input text")
+    overall_label: SentimentLabel = Field(..., description="Overall text sentiment")
+    toxic_sentence_count: int = Field(..., description="Number of toxic sentences")
+    clean_sentence_count: int = Field(..., description="Number of clean sentences")
+    total_sentences: int = Field(..., description="Total number of sentences")
+    sentences: List[SentenceResult] = Field(..., description="Sentence-level results")
+    toxic_words_summary: List[ToxicWordSummary] = Field(..., description="Summary of toxic words")
+    statistics: Statistics = Field(..., description="Overall statistics")
+    html_highlighted: Optional[str] = Field(None, description="HTML with highlighting")
+
+    class Config:
+        json_schema_extra = {
+            "example": {
+                "success": True,
+                "text": "Đồ ngu ngốc!",
+                "overall_label": "toxic",
+                "toxic_sentence_count": 1,
+                "clean_sentence_count": 0,
+                "total_sentences": 1,
+                "sentences": [
+                    {
+                        "sentence_number": 1,
+                        "text": "Đồ ngu ngốc!",
+                        "label": "toxic",
+                        "confidence": 0.998,
+                        "threshold": 0.62,
+                        "word_count": 3,
+                        "word_scores": [
+                            {
+                                "word": "Đồ",
+                                "score": 0.902,
+                                "position": {"start": 0, "end": 2},
+                                "is_toxic": True,
+                                "is_stop_word": False
+                            }
+                        ]
+                    }
+                ],
+                "toxic_words_summary": [
+                    {
+                        "word": "ngu",
+                        "score": 0.924,
+                        "occurrences": 1,
+                        "sentences": [1]
+                    }
+                ],
+                "statistics": {
+                    "total_words": 3,
+                    "toxic_words": 3,
+                    "mean_score": 0.856,
+                    "median_score": 0.865,
+                    "max_score": 0.924,
+                    "min_score": 0.756
+                },
+                "html_highlighted": "<div>...</div>"
+            }
+        }
+
+
+class HealthResponse(BaseModel):
+    """Health check response"""
+
+    status: str = Field(..., description="Service status")
+    model_loaded: bool = Field(..., description="Whether model is loaded")
+    device: str = Field(..., description="Device being used (cpu/cuda)")
+    model_name: str = Field(..., description="Model name")
+    version: str = Field(..., description="API version")
+
+
+class ErrorResponse(BaseModel):
+    """Error response"""
+
+    success: bool = Field(False, description="Always false for errors")
+    error: str = Field(..., description="Error message")
+    detail: Optional[str] = Field(None, description="Detailed error information")
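These schemas double as a record of the JSON shapes the API emits. A small serialization sketch (pydantic v2 model_dump_json assumed):

# Serialization sketch for two of the response models.
from app.schemas.responses import ErrorResponse, HealthResponse

health = HealthResponse(
    status="healthy",
    model_loaded=True,
    device="cuda",
    model_name="vinai/phobert-base",
    version="1.0.0",
)
error = ErrorResponse(error="Model not loaded. Please check server logs.")
print(health.model_dump_json())
print(error.model_dump_json())  # {"success":false,"error":...,"detail":null}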
app/services/__init__.py
ADDED
File without changes
app/services/analysis_service.py
ADDED
@@ -0,0 +1,318 @@
+"""
+Analysis Service
+================
+Main analysis orchestrator (Dependency Inversion + Open/Closed)
+"""
+
+import numpy as np
+from typing import List, Dict
+from collections import defaultdict
+
+from app.models.model_loader import model_loader
+from app.services.text_processor import TextProcessor
+from app.services.gradient_service import GradientService
+from app.services.html_generator import HTMLGenerator
+from app.schemas.requests import AnalysisRequest
+from app.schemas.responses import (
+    AnalysisResponse, SentenceResult, WordScore,
+    ToxicWordSummary, Statistics, SentimentLabel
+)
+from app.core.config import settings
+from app.core.exceptions import AnalysisException
+
+
+class AnalysisService:
+    """
+    Main analysis service
+
+    Responsibilities:
+    - Orchestrate analysis pipeline
+    - Coordinate between services
+    - Build response
+
+    Dependencies:
+    - TextProcessor: Text processing
+    - GradientService: Gradient computation
+    - HTMLGenerator: HTML generation
+    """
+
+    def __init__(self):
+        self.text_processor = TextProcessor()
+        self.gradient_service = GradientService()
+        self.html_generator = HTMLGenerator()
+
+    def analyze(self, request: AnalysisRequest) -> AnalysisResponse:
+        """
+        Analyze text for toxicity
+
+        Args:
+            request: Analysis request
+
+        Returns:
+            Analysis response
+
+        Raises:
+            AnalysisException: If analysis fails
+        """
+        try:
+            # 1. Split into sentences
+            sentences = self.text_processor.split_into_sentences(request.text)
+
+            # 2. Analyze each sentence
+            sentence_results = []
+            for i, sent_info in enumerate(sentences, 1):
+                sent_result = self._analyze_sentence(
+                    sent_info,
+                    i,
+                    request.include_word_scores
+                )
+                sentence_results.append(sent_result)
+
+            # 3. Generate statistics
+            statistics = self._compute_statistics(sentence_results)
+
+            # 4. Extract toxic words summary
+            toxic_words_summary = self._extract_toxic_words_summary(sentence_results)
+
+            # 5. Generate HTML if requested
+            html_highlighted = None
+            if request.include_html:
+                html_highlighted = self.html_generator.generate_highlighted_html(
+                    request.text,
+                    [self._convert_to_dict(r) for r in sentence_results]
+                )
+
+            # 6. Determine overall label
+            toxic_count = sum(1 for r in sentence_results if r.label == SentimentLabel.TOXIC)
+            overall_label = SentimentLabel.TOXIC if toxic_count > 0 else SentimentLabel.CLEAN
+
+            # 7. Build response
+            return AnalysisResponse(
+                success=True,
+                text=request.text,
+                overall_label=overall_label,
+                toxic_sentence_count=toxic_count,
+                clean_sentence_count=len(sentences) - toxic_count,
+                total_sentences=len(sentences),
+                sentences=sentence_results,
+                toxic_words_summary=toxic_words_summary,
+                statistics=statistics,
+                html_highlighted=html_highlighted
+            )
+
+        except Exception as e:
+            raise AnalysisException(detail=str(e))
+
+    def _analyze_sentence(
+        self,
+        sent_info: Dict[str, any],
+        sent_number: int,
+        include_word_scores: bool
+    ) -> SentenceResult:
+        """Analyze single sentence"""
+        sent_text = sent_info['text']
+
+        # Extract words
+        words = self.text_processor.extract_words(sent_text)
+
+        if len(words) == 0:
+            return SentenceResult(
+                sentence_number=sent_number,
+                text=sent_text,
+                label=SentimentLabel.CLEAN,
+                confidence=0.0,
+                threshold=0.6,
+                word_count=0,
+                word_scores=[] if include_word_scores else None
+            )
+
+        # Tokenize
+        encoding = model_loader.tokenizer(
+            sent_text.lower().strip(),
+            add_special_tokens=True,
+            max_length=settings.MAX_LENGTH,
+            padding='max_length',
+            truncation=True,
+            return_tensors='pt'
+        )
+
+        # Compute gradients
+        gradient_scores, predicted_class, confidence = self.gradient_service.compute_integrated_gradients(
+            model=model_loader.model,
+            input_ids=encoding['input_ids'],
+            attention_mask=encoding['attention_mask'],
+            device=model_loader.device
+        )
+
+        # Get tokens
+        tokens = model_loader.tokenizer.convert_ids_to_tokens(
+            encoding['input_ids'][0].cpu().numpy()
+        )
+        valid_length = encoding['attention_mask'][0].sum().item()
+        tokens = tokens[:valid_length]
+
+        # Normalize gradients
+        gradient_scores_norm = self.gradient_service.normalize_scores(gradient_scores)
+
+        # Map to words
+        word_scores = self._map_tokens_to_words(tokens, gradient_scores_norm, words)
+
+        # Determine toxicity
+        is_toxic = (predicted_class == 1)
+        label = SentimentLabel.TOXIC if is_toxic else SentimentLabel.CLEAN
+
+        # Compute threshold
+        threshold = self.gradient_service.compute_threshold(word_scores, is_toxic)
+
+        # Build word scores
+        word_score_objects = None
+        if include_word_scores:
+            word_score_objects = []
+            for word_info, score in zip(words, word_scores):
+                word_score_objects.append(WordScore(
+                    word=word_info['word'],
+                    score=float(score),
+                    position={'start': word_info['start'], 'end': word_info['end']},
+                    is_toxic=score > threshold and not self.text_processor.is_stop_word(word_info['word']),
+                    is_stop_word=self.text_processor.is_stop_word(word_info['word'])
+                ))
+
+        return SentenceResult(
+            sentence_number=sent_number,
+            text=sent_text,
+            label=label,
+            confidence=float(confidence),
+            threshold=float(threshold),
+            word_count=len(words),
+            word_scores=word_score_objects
+        )
+
+    def _map_tokens_to_words(
+        self,
+        tokens: List[str],
+        token_scores: np.ndarray,
+        original_words: List[Dict[str, any]]
+    ) -> np.ndarray:
+        """Map token scores to words"""
+        clean_tokens = []
+        clean_scores = []
+
+        for token, score in zip(tokens, token_scores):
+            if token not in ['<s>', '</s>', '<pad>', '<unk>']:
+                clean_token = token.replace('_', '').replace('@@', '').strip()
+                if clean_token and not self.text_processor.is_punctuation(clean_token):
+                    clean_tokens.append(clean_token)
+                    clean_scores.append(score)
+
+        word_scores = []
+        token_idx = 0
+
+        for word_info in original_words:
+            word = word_info['word'].lower()
+
+            matching_scores = []
+            temp_idx = token_idx
+            accumulated = ""
+
+            while temp_idx < len(clean_tokens):
+                accumulated += clean_tokens[temp_idx]
+                matching_scores.append(clean_scores[temp_idx])
+
+                if accumulated == word:
+                    token_idx = temp_idx + 1
+                    break
+                elif len(accumulated) >= len(word):
+                    break
+
+                temp_idx += 1
+
+            word_scores.append(max(matching_scores) if matching_scores else 0.0)
+
+        return np.array(word_scores)
+
+    def _compute_statistics(self, sentence_results: List[SentenceResult]) -> Statistics:
+        """Compute overall statistics"""
+        all_scores = []
+        toxic_words_count = 0
+
+        for sent_result in sentence_results:
+            if sent_result.word_scores:
+                for ws in sent_result.word_scores:
+                    all_scores.append(ws.score)
+                    if ws.is_toxic:
+                        toxic_words_count += 1
+
+        if len(all_scores) == 0:
+            return Statistics(
+                total_words=0,
+                toxic_words=0,
+                mean_score=0.0,
+                median_score=0.0,
+                max_score=0.0,
+                min_score=0.0
+            )
+
+        all_scores = np.array(all_scores)
+
+        return Statistics(
+            total_words=len(all_scores),
+            toxic_words=toxic_words_count,
+            mean_score=float(np.mean(all_scores)),
+            median_score=float(np.median(all_scores)),
+            max_score=float(np.max(all_scores)),
+            min_score=float(np.min(all_scores))
+        )
+
+    def _extract_toxic_words_summary(
+        self,
+        sentence_results: List[SentenceResult]
+    ) -> List[ToxicWordSummary]:
+        """Extract summary of toxic words"""
+        toxic_words_dict = defaultdict(lambda: {
+            'max_score': 0.0,
+            'occurrences': 0,
+            'sentences': []
+        })
+
+        for sent_result in sentence_results:
+            if sent_result.word_scores:
+                for ws in sent_result.word_scores:
+                    if ws.is_toxic:
+                        word = ws.word
+                        toxic_words_dict[word]['max_score'] = max(
+                            toxic_words_dict[word]['max_score'],
+                            ws.score
+                        )
+                        toxic_words_dict[word]['occurrences'] += 1
+                        if sent_result.sentence_number not in toxic_words_dict[word]['sentences']:
+                            toxic_words_dict[word]['sentences'].append(sent_result.sentence_number)
+
+        # Convert to list and sort by score
+        toxic_words_summary = [
+            ToxicWordSummary(
+                word=word,
+                score=data['max_score'],
+                occurrences=data['occurrences'],
+                sentences=sorted(data['sentences'])
+            )
+            for word, data in toxic_words_dict.items()
+        ]
+
+        toxic_words_summary.sort(key=lambda x: x.score, reverse=True)
+
+        return toxic_words_summary
+
+    def _convert_to_dict(self, sent_result: SentenceResult) -> Dict[str, any]:
+        """Convert SentenceResult to dict for HTML generator"""
+        return {
+            'sent_start': sent_result.word_scores[0].position['start'] if sent_result.word_scores and len(sent_result.word_scores) > 0 else 0,
+            'sent_end': sent_result.word_scores[-1].position['end'] if sent_result.word_scores and len(sent_result.word_scores) > 0 else len(sent_result.text),
+            'is_toxic': sent_result.label == SentimentLabel.TOXIC,
+            'words': [{'word': ws.word, 'start': ws.position['start'], 'end': ws.position['end']} for ws in sent_result.word_scores] if sent_result.word_scores else [],
+            'scores': [ws.score for ws in sent_result.word_scores] if sent_result.word_scores else [],
+            'threshold': sent_result.threshold
+        }
+
+
+# Singleton instance
+analysis_service = AnalysisService()
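The _map_tokens_to_words helper greedily concatenates cleaned subword pieces until they spell the next original word and assigns that word the maximum piece score. A toy illustration with hypothetical pieces and scores (a word split across two subwords takes the larger of the two):

# Toy illustration of the greedy subword-to-word mapping idea
# (hypothetical pieces; real pieces come from the PhoBERT tokenizer).
import numpy as np

clean_tokens = ["đồ", "ng", "ốc"]    # '_' / '@@' markers already stripped
clean_scores = [0.30, 0.70, 0.90]
original_words = ["đồ", "ngốc"]

word_scores, token_idx = [], 0
for word in original_words:
    matching, temp_idx, accumulated = [], token_idx, ""
    while temp_idx < len(clean_tokens):
        accumulated += clean_tokens[temp_idx]
        matching.append(clean_scores[temp_idx])
        if accumulated == word:
            token_idx = temp_idx + 1
            break
        elif len(accumulated) >= len(word):
            break
        temp_idx += 1
    word_scores.append(max(matching) if matching else 0.0)

print(np.array(word_scores))  # [0.3 0.9] -- "ngốc" takes the max of its two pieces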
app/services/gradient_service.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradient Service
|
| 3 |
+
================
|
| 4 |
+
Gradient computation using Integrated Gradients (Single Responsibility)
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
import numpy as np
|
| 10 |
+
from typing import Tuple
|
| 11 |
+
|
| 12 |
+
from app.models.phobert_model import PhoBERTFineTuned
|
| 13 |
+
from app.core.config import settings
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GradientService:
|
| 17 |
+
"""
|
| 18 |
+
Gradient computation service
|
| 19 |
+
|
| 20 |
+
Responsibilities:
|
| 21 |
+
- Compute integrated gradients
|
| 22 |
+
- Calculate importance scores
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
@staticmethod
|
| 26 |
+
def compute_integrated_gradients(
|
| 27 |
+
model: PhoBERTFineTuned,
|
| 28 |
+
input_ids: torch.Tensor,
|
| 29 |
+
attention_mask: torch.Tensor,
|
| 30 |
+
device: torch.device,
|
| 31 |
+
target_class: int | None = None,
|
| 32 |
+
steps: int | None = None
|
| 33 |
+
) -> Tuple[np.ndarray, int, float]:
|
| 34 |
+
"""
|
| 35 |
+
Compute integrated gradients
|
| 36 |
+
|
| 37 |
+
Args:
|
| 38 |
+
model: Model to analyze
|
| 39 |
+
input_ids: Input token IDs
|
| 40 |
+
attention_mask: Attention mask
|
| 41 |
+
device: Device
|
| 42 |
+
target_class: Target class (optional)
|
| 43 |
+
steps: Number of integration steps
|
| 44 |
+
|
| 45 |
+
Returns:
|
| 46 |
+
importance_scores: Token importance scores
|
| 47 |
+
predicted_class: Predicted class
|
| 48 |
+
confidence: Prediction confidence
|
| 49 |
+
"""
|
| 50 |
+
if steps is None:
|
| 51 |
+
steps = settings.GRADIENT_STEPS
|
| 52 |
+
|
| 53 |
+
model.eval()
|
| 54 |
+
|
| 55 |
+
input_ids = input_ids.to(device)
|
| 56 |
+
attention_mask = attention_mask.to(device)
|
| 57 |
+
|
| 58 |
+
        # Get original embeddings
        with torch.no_grad():
            outputs = model.embedding(input_ids, attention_mask=attention_mask)
            original_hidden = outputs.last_hidden_state

        baseline_hidden = torch.zeros_like(original_hidden)
        integrated_grads = torch.zeros_like(original_hidden)

        # Integrate gradients
        for step in range(steps):
            alpha = (step + 1) / steps
            interpolated = baseline_hidden + alpha * (original_hidden - baseline_hidden)
            interpolated = interpolated.detach().clone()
            interpolated.requires_grad = True

            # Forward pass through classification head
            if model.pooling == 'cls':
                pooled = interpolated[:, 0, :]
            else:
                mask_expanded = attention_mask.unsqueeze(-1).expand(interpolated.size()).float()
                sum_embeddings = torch.sum(interpolated * mask_expanded, 1)
                sum_mask = mask_expanded.sum(1)
                pooled = sum_embeddings / sum_mask

            pooled = model.layer_norm(pooled)
            out = model.dropout(pooled)
            out = model.relu(model.fc1(out))
            out = model.dropout(out)
            logits = model.fc2(out)

            # Get prediction on first step
            if step == 0:
                probs = F.softmax(logits, dim=1)
                predicted_class = torch.argmax(probs, dim=1).item()
                confidence = probs[0, predicted_class].item()
                if target_class is None:
                    target_class = predicted_class

            # Backward pass
            model.zero_grad()
            logits[0, target_class].backward()
            integrated_grads += interpolated.grad

        # Average and scale
        integrated_grads = integrated_grads / steps
        integrated_grads = integrated_grads * (original_hidden - baseline_hidden)

        # Compute importance scores
        importance_scores = torch.sum(torch.abs(integrated_grads), dim=-1)
        importance_scores = importance_scores[0].cpu().detach().numpy()

        valid_length = attention_mask[0].sum().item()
        importance_scores = importance_scores[:valid_length]

        return importance_scores, predicted_class, confidence

    @staticmethod
    def normalize_scores(scores: np.ndarray) -> np.ndarray:
        """
        Normalize scores to [0, 1]

        Args:
            scores: Raw scores

        Returns:
            Normalized scores
        """
        min_score = scores.min()
        max_score = scores.max()

        if max_score - min_score < 1e-8:
            return np.ones_like(scores) * 0.5

        return (scores - min_score) / (max_score - min_score)

    @staticmethod
    def compute_threshold(
        scores: np.ndarray,
        is_toxic: bool,
        percentile: int | None = None
    ) -> float:
        """
        Compute threshold for toxicity

        Args:
            scores: Word scores
            is_toxic: Whether text is toxic
            percentile: Percentile for threshold

        Returns:
            Threshold value
        """
        if percentile is None:
            percentile = settings.PERCENTILE_THRESHOLD

        if len(scores) == 0:
            return 0.6

        mean_score = np.mean(scores)
        percentile_score = np.percentile(scores, percentile)
        threshold = 0.6 * percentile_score + 0.4 * mean_score

        if is_toxic:
            threshold = max(threshold, 0.55)
        else:
            threshold = max(threshold, 0.75)

        threshold = np.clip(threshold, 0.45, 0.90)

        return float(threshold)
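For reference, a minimal usage sketch of the two scoring helpers above. The class name GradientService is an assumption (only part of gradient_service.py is visible in this hunk), and the score values are purely illustrative:

# Illustrative sketch only -- the class name GradientService is assumed.
import numpy as np
from app.services.gradient_service import GradientService

raw_scores = np.array([0.12, 0.85, 0.40, 0.97])             # per-word attribution scores
norm_scores = GradientService.normalize_scores(raw_scores)  # rescaled into [0, 1]
threshold = GradientService.compute_threshold(norm_scores, is_toxic=True, percentile=70)
toxic_mask = norm_scores > threshold                        # words that would be highlighted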
app/services/html_generator.py
ADDED
@@ -0,0 +1,130 @@
"""
HTML Generator
==============
Generate HTML highlighting (Single Responsibility)
"""

from typing import Any, Dict, List

from app.services.text_processor import TextProcessor


class HTMLGenerator:
    """
    HTML generation service

    Responsibilities:
    - Generate HTML with highlighting
    - Format toxic/clean sentences differently
    """

    @staticmethod
    def generate_highlighted_html(
        text: str,
        sentence_results: List[Dict[str, Any]]
    ) -> str:
        """
        Generate HTML with highlighting

        Args:
            text: Original text
            sentence_results: List of sentence analysis results

        Returns:
            HTML string with highlighting
        """
        html = '<div style="line-height: 2.2; font-size: 16px; font-family: Arial; max-width: 900px;">'

        last_end = 0

        for sent_data in sentence_results:
            sent_start = sent_data['sent_start']
            sent_end = sent_data['sent_end']
            is_toxic = sent_data['is_toxic']
            words = sent_data['words']
            scores = sent_data['scores']
            threshold = sent_data['threshold']

            # Add space between sentences
            if sent_start > last_end:
                html += text[last_end:sent_start]

            sent_text = text[sent_start:sent_end]

            if is_toxic:
                # Toxic sentence - highlight words
                sent_html = HTMLGenerator._generate_toxic_sentence_html(
                    sent_text, sent_start, words, scores, threshold
                )
                html += f'<span style="border-left: 3px solid #ff6b6b; padding-left: 8px; display: inline-block; margin: 4px 0;">{sent_html}</span>'
            else:
                # Clean sentence - plain text
                html += f'<span style="color: #444;">{sent_text}</span>'

            last_end = sent_end

        # Add remaining text
        if last_end < len(text):
            html += text[last_end:]

        html += '</div>'
        return html

    @staticmethod
    def _generate_toxic_sentence_html(
        sent_text: str,
        sent_start: int,
        words: List[Dict[str, Any]],
        scores: List[float],
        threshold: float
    ) -> str:
        """
        Generate HTML for toxic sentence

        Args:
            sent_text: Sentence text
            sent_start: Sentence start position in full text
            words: List of words
            scores: Word scores
            threshold: Toxicity threshold

        Returns:
            HTML string for sentence
        """
        sent_html = ""
        char_idx = 0
        word_idx = 0

        while char_idx < len(sent_text):
            if word_idx < len(words):
                word_info = words[word_idx]
                word_start_rel = word_info['start'] - sent_start
                word_end_rel = word_info['end'] - sent_start

                if char_idx == word_start_rel:
                    word = word_info['word']
                    score = scores[word_idx]

                    if score > threshold and not TextProcessor.is_stop_word(word) and len(word) > 1:
                        # Toxic word - red background
                        color = int(255 * (1 - score))
                        sent_html += (
                            f'<span style="background-color: rgb(255, {color}, {color}); '
                            f'padding: 2px 4px; margin: 0 1px; border-radius: 3px; '
                            f'font-weight: bold;">{word}</span>'
                        )
                    else:
                        # Non-toxic word
                        if TextProcessor.is_stop_word(word):
                            sent_html += f'<span style="color: #aaa; font-style: italic;">{word}</span>'
                        else:
                            sent_html += f'<span style="color: #333;">{word}</span>'

                    char_idx = word_end_rel
                    word_idx += 1
                    continue

            # Not at word - add character (punctuation, space, etc)
            sent_html += sent_text[char_idx]
            char_idx += 1

        return sent_html
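A short usage sketch of HTMLGenerator follows. The text, offsets, and scores are made-up illustrative values; the sentence_results shape simply mirrors the keys read by generate_highlighted_html above:

# Illustrative sketch only -- values below are invented for demonstration.
from app.services.html_generator import HTMLGenerator

text = "Chao ban. May ngu the!"
sentence_results = [
    {   # second sentence flagged as toxic; offsets are absolute positions in text
        'sent_start': 10, 'sent_end': 22, 'is_toxic': True,
        'words': [
            {'word': 'May', 'start': 10, 'end': 13},
            {'word': 'ngu', 'start': 14, 'end': 17},
            {'word': 'the', 'start': 18, 'end': 21},
        ],
        'scores': [0.30, 0.95, 0.20],
        'threshold': 0.60,
    },
]
html = HTMLGenerator.generate_highlighted_html(text, sentence_results)
# "ngu" gets a red background span; the other words are rendered as plain text.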
app/services/text_processor.py
ADDED
@@ -0,0 +1,142 @@
"""
Text Processor
==============
Text processing utilities (Single Responsibility)
"""

import re
from typing import Any, Dict, List


class TextProcessor:
    """
    Text processing service

    Responsibilities:
    - Split text into sentences
    - Extract words from text
    - Identify stop words
    - Identify punctuation
    """

    STOP_WORDS = {
        'này', 'kia', 'đó', 'ấy', 'nọ', 'đây', 'nào',
        'các', 'những', 'mọi', 'cả',
        'tôi', 'ta', 'mình', 'bạn', 'anh', 'chị', 'em',
        'nó', 'họ', 'chúng', 'ai', 'gì',
        'và', 'hoặc', 'nhưng', 'mà', 'nên', 'vì', 'nếu', 'thì', 'hay',
        'rồi', 'còn', 'cũng', 'luôn', 'đều',
        'thế', 'như',
        'của', 'cho', 'với', 'từ', 'bởi', 'về', 'trong', 'ngoài',
        'là', 'có', 'được', 'bị', 'ở', 'đang', 'sẽ', 'đã',
        'thể', 'phải', 'nên', 'muốn', 'cần', 'biết',
        'rất', 'quá', 'khá', 'hơi', 'vẫn', 'còn',
        'chỉ', 'vừa', 'mới',
        'đâu', 'sao',
        'không', 'chẳng', 'chưa',
        'nhiều', 'ít', 'vài', 'một',
        'việc', 'chuyện', 'điều', 'lúc', 'khi',
        'ra', 'vào', 'nhau', 'nhữ',
        'vậy', 'ạ', 'nhé',
    }

    PUNCTUATION = set('.,!?;:()[]{}"\'-/\\@#$%^&*+=<>~`|')

    @staticmethod
    def split_into_sentences(text: str) -> List[Dict[str, Any]]:
        """
        Split text into sentences

        Args:
            text: Input text

        Returns:
            List of sentences with positions
        """
        sentence_pattern = r'([.!?]+)\s*'
        parts = re.split(sentence_pattern, text)

        sentences = []
        current_pos = 0
        i = 0

        while i < len(parts):
            if not parts[i].strip():
                current_pos += len(parts[i])
                i += 1
                continue

            if not re.match(r'^[.!?]+$', parts[i]):
                sentence_text = parts[i]

                if i + 1 < len(parts) and re.match(r'^[.!?]+$', parts[i + 1]):
                    sentence_text += parts[i + 1]
                    i += 2
                else:
                    i += 1

                if sentence_text.strip():
                    sentences.append({
                        'text': sentence_text,
                        'start': current_pos,
                        'end': current_pos + len(sentence_text)
                    })

                current_pos += len(sentence_text)
            else:
                current_pos += len(parts[i])
                i += 1

        if len(sentences) == 0:
            sentences.append({'text': text, 'start': 0, 'end': len(text)})

        return sentences

    @staticmethod
    def extract_words(text: str) -> List[Dict[str, Any]]:
        """
        Extract words from text

        Args:
            text: Input text

        Returns:
            List of words with positions
        """
        pattern = r'[a-zA-Zàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ_]+'

        words = []
        for match in re.finditer(pattern, text, re.IGNORECASE):
            words.append({
                'word': match.group(),
                'start': match.start(),
                'end': match.end()
            })

        return words

    @classmethod
    def is_stop_word(cls, word: str) -> bool:
        """
        Check if word is a stop word

        Args:
            word: Word to check

        Returns:
            True if stop word
        """
        return word.lower().strip() in cls.STOP_WORDS

    @classmethod
    def is_punctuation(cls, token: str) -> bool:
        """
        Check if token is punctuation

        Args:
            token: Token to check

        Returns:
            True if punctuation
        """
        return not token or all(c in cls.PUNCTUATION for c in token)
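And a usage sketch of TextProcessor (the outputs shown in comments are indicative, not exhaustive):

# Illustrative sketch only.
from app.services.text_processor import TextProcessor

text = "Chao ban. May ngu the!"
sentences = TextProcessor.split_into_sentences(text)
# -> list of dicts with 'text', 'start', 'end'; the first entry is
#    {'text': 'Chao ban.', 'start': 0, 'end': 9}
words = TextProcessor.extract_words(sentences[1]['text'])
# -> [{'word': 'May', 'start': 0, 'end': 3}, {'word': 'ngu', ...}, {'word': 'the', ...}]
print(TextProcessor.is_stop_word('nhưng'))   # True  (Vietnamese stop word)
print(TextProcessor.is_punctuation('!?'))    # True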
models/PhoBERTFineTuned_best.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4fb4d10f4754fe5c7d45d992ea7c0461f5e4e9fffc6a66fb96ed66ccddb90618
size 540876678
requirements.txt
ADDED
@@ -0,0 +1,9 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0
pydantic-settings==2.1.0
python-multipart==0.0.6
torch==2.1.0
transformers==4.35.0
numpy==1.24.3
python-dotenv==1.0.0