Spaces:

Sp2503
/

MuriL-2.0

Runtime error

App Files Files Community

Sai809701 committed on Nov 7

Commit

ce552a1

1 Parent(s): 3b2edfc

muril api

Browse files

Files changed (4) hide show

Dockerfile +25 -0
embed_build.py +114 -0
main.py +221 -0
requirements.txt +7 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+# Dockerfile.muril
+# Image that installs the Python dependencies and serves main.py with uvicorn on port 7860.
+FROM python:3.11-slim
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential git curl ca-certificates \
+ && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# The dependency list committed next to this Dockerfile is named requirements.txt.
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+# Copy app
+COPY main.py /app/main.py
+ENV HF_HOME=/app/hf_cache
+ENV TRANSFORMERS_CACHE=/app/hf_cache
+# main.py falls back to the CPU device when TORCH_DISABLE_CUDA is "1".
+ENV TORCH_DISABLE_CUDA=1
+# NOTE(review): main.py does not read OUT_DIR or MODEL_DIR from the environment - confirm they are still needed.
+ENV OUT_DIR=/app/export_artifacts
+ENV MODEL_DIR=/app/muril_multilang_out
+ENV PORT=7860
+EXPOSE 7860
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

embed_build.py ADDED Viewed

	@@ -0,0 +1,114 @@

+# embed_build_muril.py
+"""
+Produce answer embeddings for the dataset using a fine-tuned MuRIL model.
+Saves:
+ - muril_multilingual_dataset.csv  (columns: question, answer, language)
+ - answer_embeddings.pt            (torch tensor shape [N, D], float32, on CPU)
+Usage:
+ python embed_build_muril.py \
+    --model_dir ./muril_multilang_out \
+    --input_jsonl /path/to/legal_multilingual_QA_10k.jsonl \
+    --out_dir ./export_artifacts \
+    --batch_size 64
+"""
+import argparse, os, math
+from pathlib import Path
+import torch
+import pandas as pd
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, AutoModel
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--model_dir", type=str, default="./muril_multilang_out", help="Path or HF repo id of fine-tuned MuRIL")
+    p.add_argument("--input_jsonl", type=str, required=True, help="Path to legal_multilingual_QA_10k.jsonl")
+    p.add_argument("--out_dir", type=str, default="./export_artifacts")
+    p.add_argument("--langs", type=str, default="en,hi,mr,ta,bn,gu,kn,ml,pa,or,as,ur,sa,ne", help="comma-separated languages to merge (will stack)")
+    p.add_argument("--text_prefix", type=str, default="question_", help="prefix for question columns in JSONL")
+    p.add_argument("--answer_col_prefix", type=str, default="answer_", help="prefix for answer columns if present (not used here)")
+    p.add_argument("--batch_size", type=int, default=64)
+    p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
+    return p.parse_args()
+def mean_pooling(last_hidden_state, attention_mask):
+    # last_hidden_state: (B, L, H)
+    # attention_mask: (B, L)
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
+    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    return sum_embeddings / sum_mask
+def build_question_answer_rows(df, langs, text_prefix):
+    rows = []
+    for _, r in df.iterrows():
+        # merge all available language question/answer pairs by stacking
+        for lang in langs:
+            qcol = f"{text_prefix}{lang}"
+            acol = f"answer_{lang}"
+            # If dataset uses question_<lang> and answer_<lang>, use them; otherwise fall back to question_<lang> and common 'answer' field.
+            q = r.get(qcol, None)
+            if q is None or str(q).strip() == "" or str(q).lower() == "nan":
+                continue
+            # pick answer_<lang> if present else "answer" column
+            if acol in df.columns and pd.notna(r.get(acol)):
+                a = r.get(acol)
+            else:
+                a = r.get("answer", None)
+            if a is None or str(a).strip() == "" or str(a).lower() == "nan":
+                continue
+            rows.append({"question": str(q).strip(), "answer": str(a).strip(), "language": lang})
+    return pd.DataFrame(rows)
+def main():
+    args = parse_args()
+    os.makedirs(args.out_dir, exist_ok=True)
+    # load JSONL to pandas
+    print("Loading dataset:", args.input_jsonl)
+    df_in = pd.read_json(args.input_jsonl, lines=True, dtype=str)
+    # Build rows stacked across languages (question_<lang>, answer optional)
+    langs = [l.strip() for l in args.langs.split(",") if l.strip()]
+    print("Merging language columns (stack)... langs:", langs)
+    rows_df = build_question_answer_rows(df_in, langs, args.text_prefix)
+    if rows_df.empty:
+        raise SystemExit("No question/answer rows found after merging languages. Check your columns.")
+    print(f"Total rows extracted: {len(rows_df)}")
+    # Save CSV (order matters)
+    csv_path = Path(args.out_dir) / "muril_multilingual_dataset.csv"
+    rows_df.to_csv(csv_path, index=False, encoding="utf-8")
+    print("Saved merged CSV to:", csv_path)
+    # Load model & tokenizer
+    print("Loading tokenizer & model from:", args.model_dir, "device:", args.device)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_dir, use_fast=True)
+    model = AutoModel.from_pretrained(args.model_dir)
+    model.to(args.device)
+    model.eval()
+    # Encode answers in batches
+    answers = rows_df["answer"].astype(str).tolist()
+    batch_size = int(args.batch_size)
+    all_embs = []
+    with torch.inference_mode():
+        for i in tqdm(range(0, len(answers), batch_size), desc="Encoding"):
+            batch_texts = answers[i:i+batch_size]
+            encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
+            input_ids = encoded["input_ids"].to(args.device)
+            attention_mask = encoded["attention_mask"].to(args.device)
+            out = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+            last_hidden = out.last_hidden_state  # (B, L, H)
+            pooled = mean_pooling(last_hidden, attention_mask)  # (B, H)
+            # L2-normalize embeddings (optional but recommended for cosine similarity)
+            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
+            all_embs.append(pooled.cpu())
+    all_embs = torch.cat(all_embs, dim=0)  # (N, H)
+    print("Embeddings shape:", all_embs.shape)
+    embed_path = Path(args.out_dir) / "answer_embeddings.pt"
+    torch.save(all_embs, embed_path)
+    print("Saved embeddings to:", embed_path)
+    print("Done. Artifacts in:", args.out_dir)
+if __name__ == "__main__":
+    main()

main.py ADDED Viewed

	@@ -0,0 +1,221 @@

+# main.py
+import os
+import sys
+import torch
+import pandas as pd
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List, Optional
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer, AutoModel
+import numpy as np
+# -----------------------
+# Configuration (every value below can be overridden through an environment variable)
+# -----------------------
+# Hub repo that holds the artifacts (CSV, embeddings, optionally model weights)
+HF_REPO = os.getenv(
+    "HF_REPO",
+    "Sp2503/Finetuned-multilingualdataset-MuriL-model"
+)
+# Directory handed to snapshot_download as cache_dir (read from HF_CACHE_DIR)
+CACHE_DIR = os.getenv("HF_CACHE_DIR", "/tmp/hf_artifacts")
+# File names looked up in the root of the downloaded snapshot
+CSV_FILENAME = os.getenv("CSV_FILENAME", "muril_multilingual_dataset.csv")
+EMBED_FILENAME = os.getenv("EMBED_FILENAME", "answer_embeddings.pt")
+# Sub-directory of the snapshot that holds the model files, when they are not in the root (optional)
+MODEL_SUBDIR = os.getenv("MODEL_SUBDIR", "")  # leave empty if model is in root
+# Retrieval config: number of hits to rank per query
+TOP_K = int(os.getenv("TOP_K", "1"))
+DEVICE = "cuda" if torch.cuda.is_available() and os.getenv("TORCH_DISABLE_CUDA", "0") != "1" else "cpu"  # TORCH_DISABLE_CUDA=1 forces "cpu"
+# -----------------------
+# Utility helpers
+# -----------------------
+def download_and_verify(repo_id: str, repo_type: str = "model", cache_dir: str = CACHE_DIR):
+    """
+    Download repo snapshot and verify that CSV + embeddings exist.
+    Returns absolute paths: model_dir, csv_path, embed_path
+    """
+    print(f"🔁 snapshot_download: repo_id={repo_id} cache_dir={cache_dir}")
+    try:
+        model_dir = snapshot_download(repo_id=repo_id, repo_type=repo_type, cache_dir=cache_dir)
+    except Exception as e:
+        raise RuntimeError(
+            f"Failed to snapshot_download repo {repo_id}. "
+            "Make sure HUGGINGFACE_HUB_TOKEN (or HF_TOKEN) is set for private repos and that repo exists."
+        ) from e
+    csv_path = os.path.join(model_dir, CSV_FILENAME)
+    embed_path = os.path.join(model_dir, EMBED_FILENAME)
+    model_path = os.path.join(model_dir, MODEL_SUBDIR) if MODEL_SUBDIR else model_dir
+    # checks
+    missing = []
+    if not os.path.isfile(csv_path):
+        missing.append(csv_path)
+    if not os.path.isfile(embed_path):
+        missing.append(embed_path)
+    if missing:
+        raise FileNotFoundError(
+            f"Missing artifact(s) in the downloaded repo: {missing}\n"
+            f"Push {CSV_FILENAME} and {EMBED_FILENAME} to the repo '{repo_id}' or set HF_REPO to the correct one."
+        )
+    print(f"✅ Downloaded snapshot to: {model_dir}")
+    print(f"✅ Found CSV at: {csv_path}")
+    print(f"✅ Found embeddings at: {embed_path}")
+    return model_path, csv_path, embed_path
+def load_embeddings(emb_path: str, csv_len_expected: Optional[int] = None):
+    emb = torch.load(emb_path, map_location="cpu")
+    if not isinstance(emb, torch.Tensor):
+        raise ValueError(f"Embeddings file {emb_path} did not load a torch.Tensor (type={type(emb)}).")
+    if emb.ndim != 2:
+        raise ValueError(f"Embeddings tensor must have shape [N, D]. Got {tuple(emb.shape)}.")
+    if csv_len_expected is not None and emb.shape[0] != csv_len_expected:
+        raise ValueError(f"Mismatch: CSV rows={csv_len_expected} but embeddings rows={emb.shape[0]}. Ensure ordering matches.")
+    return emb
+def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor):
+    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+    summed = torch.sum(last_hidden_state * mask, dim=1)
+    denom = torch.clamp(mask.sum(dim=1), min=1e-9)
+    return summed / denom
+# -----------------------
+# Startup: download artifacts (runs at import time; any failure aborts the import)
+# -----------------------
+try:
+    MODEL_DIR, CSV_PATH, EMBED_PATH = download_and_verify(HF_REPO, repo_type="model", cache_dir=CACHE_DIR)
+except Exception as e:
+    print("✖ ERROR during snapshot_download or verification:", e, file=sys.stderr)
+    raise  # logged to stderr above, then propagated unchanged
+# -----------------------
+# Load CSV and embeddings
+# -----------------------
+print("📥 Loading CSV:", CSV_PATH)
+df = pd.read_csv(CSV_PATH, dtype=str).fillna("")  # every column as text; missing cells become ""
+# The CSV must provide both a 'question' and an 'answer' column
+if not {"question", "answer"}.issubset(set(df.columns)):
+    raise RuntimeError(f"CSV must contain 'question' and 'answer' columns. Found: {df.columns.tolist()}")
+# When the CSV has no 'language' column, tag every row as "en"
+if "language" not in df.columns:
+    df["language"] = "en"
+print("📥 Loading embeddings (this may take a second)...")
+answer_embeddings = load_embeddings(EMBED_PATH, csv_len_expected=len(df))  # row count must equal the CSV row count
+# Cast to float32 if needed; queries are L2-normalized when encoded, so the stored rows are normalized below as well
+if answer_embeddings.dtype != torch.float32:
+    answer_embeddings = answer_embeddings.to(torch.float32)
+# With unit-length rows on both sides, the dot product equals the cosine similarity
+answer_embeddings = torch.nn.functional.normalize(answer_embeddings, p=2, dim=1)
+print("✅ Loaded dataset rows:", len(df), "embedding dim:", answer_embeddings.shape[1])
+# -----------------------
+# Load the tokenizer and encoder used to embed incoming queries (AutoModel + tokenizer)
+# -----------------------
+print("⚙️ Loading tokenizer & model for text encoding from:", MODEL_DIR)
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
+    encoder_model = AutoModel.from_pretrained(MODEL_DIR)
+    encoder_model.to(DEVICE)
+    encoder_model.eval()
+except Exception as e:
+    # Fail fast: re-raise with guidance, keeping the original error chained
+    raise RuntimeError(
+        f"Failed to load model/tokenizer from {MODEL_DIR}. "
+        "If the repo stores only adapters/LoRA, you must load base model + apply adapters. "
+        "Ensure the repo contains full model files or set MODEL_SUBDIR appropriately."
+    ) from e
+def encode_query(texts: List[str], batch_size: int = 32):
+    all_embs = []
+    with torch.inference_mode():
+        for i in range(0, len(texts), batch_size):
+            batch = texts[i : i + batch_size]
+            enc = tokenizer(batch, padding=True, truncation=True, max_length=256, return_tensors="pt")
+            input_ids = enc["input_ids"].to(DEVICE)
+            attention_mask = enc["attention_mask"].to(DEVICE)
+            out = encoder_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+            pooled = mean_pool(out.last_hidden_state, attention_mask)
+            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
+            all_embs.append(pooled.cpu())
+    return torch.cat(all_embs, dim=0)
+# -----------------------
+# FastAPI app and the request/response models of POST /get-answer
+# -----------------------
+app = FastAPI(title="MuRIL Multilingual QA API (Hub-backed artifacts)")
+class QueryRequest(BaseModel):  # request body of POST /get-answer
+    question: str  # query text; the endpoint answers HTTP 400 when it is blank
+    lang: Optional[str] = None  # optional exact-match filter on the CSV "language" column
+class QAResponse(BaseModel):  # response body of POST /get-answer
+    answer: str  # answer text of the best-scoring row
+    detected_lang: str  # the endpoint echoes the requested lang, or "all" when none was given
+    top_k: Optional[List[dict]] = None  # ranked hits; the endpoint fills it only when more than one hit is returned
+@app.get("/")
+def root():
+    langs = sorted(df["language"].unique().tolist())
+    return {
+        "status": "✅ MuRIL Multilingual QA API Running",
+        "available_languages": langs,
+        "model_repo": HF_REPO,
+        "loaded_rows": len(df),
+    }
+@app.post("/get-answer", response_model=QAResponse)
+def get_answer_endpoint(req: QueryRequest):
+    qtext = (req.question or "").strip()
+    if not qtext:
+        raise HTTPException(status_code=400, detail="Empty question")
+    lang_filter = (req.lang or "").strip()
+    filtered_df = df
+    filtered_embeddings = answer_embeddings
+    if lang_filter:
+        mask = df["language"] == lang_filter
+        if not mask.any():
+            # no data for this language
+            return QAResponse(answer=f"⚠️ No data found for language '{lang_filter}'.", detected_lang=lang_filter)
+        filtered_df = df[mask].reset_index(drop=True)
+        filtered_embeddings = answer_embeddings[mask.values]
+    # encode query
+    q_emb = encode_query([qtext], batch_size=1)  # shape (1, D)
+    sims = torch.matmul(q_emb, filtered_embeddings.T).squeeze(0)  # (N,)
+    k = max(1, min(TOP_K, len(filtered_df)))
+    topv, topi = torch.topk(sims, k=k)
+    best_idx = int(topi[0].item())
+    answer = filtered_df.iloc[best_idx]["answer"]
+    top_k_payload = None
+    if k > 1:
+        top_k_payload = []
+        for rank in range(k):
+            idx = int(topi[rank].item())
+            top_k_payload.append({
+                "rank": rank + 1,
+                "score": float(topv[rank].item()),
+                "answer": filtered_df.iloc[idx]["answer"],
+                "question": filtered_df.iloc[idx]["question"],
+            })
+    return QAResponse(answer=answer, detected_lang=lang_filter or "all", top_k=top_k_payload)
+# -----------------------
+# Run the server when this file is executed directly; the port comes from PORT (default 7860)
+# -----------------------
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("main:app", host="0.0.0.0", port=int(os.getenv("PORT", "7860")), workers=1)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi==0.115.5
+uvicorn[standard]==0.32.0
+pandas==2.2.3
+torch>=2.1.0
+transformers==4.46.0
+huggingface_hub>=0.14.1
+tqdm