Sai809701 committed
Commit 6677176 · 1 Parent(s): b46441b

updated files

Files changed (5):
  1. Dockerfile +45 -15
  2. entrypoint.sh +38 -0
  3. model_loader.py +78 -0
  4. regenerate_embeddings.py +125 -0
  5. requirements.txt +2 -0
Dockerfile CHANGED
@@ -1,25 +1,55 @@
-# Dockerfile.muril
+# Dockerfile — CPU-optimized, robust for Hugging Face Spaces / local deployment
 FROM python:3.11-slim
 
 ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential git curl ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-
 WORKDIR /app
 
+# ---------------------------
+# System deps
+# ---------------------------
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       build-essential git curl ca-certificates libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# ---------------------------
+# Copy requirements files
+# ---------------------------
+# We'll install torch (CPU) first from the official PyTorch index, then install other requirements.
 COPY requirements.txt /app/requirements.txt
-RUN pip install --no-cache-dir -r /app/requirements.txt
 
-# Copy app
-COPY main.py /app/main.py
-
-ENV HF_HOME=/app/hf_cache
-ENV TRANSFORMERS_CACHE=/app/hf_cache
-ENV TORCH_DISABLE_CUDA=1
-ENV OUT_DIR=/app/export_artifacts
-ENV MODEL_DIR=/app/muril_multilang_out
+# ---------------------------
+# Install PyTorch CPU wheel first to avoid torchvision/torch mismatch
+# (adjust CPU wheel index if you want GPU/CUDA variant)
+# ---------------------------
+RUN pip --no-cache-dir install --upgrade pip setuptools wheel \
+    && pip --no-cache-dir install --index-url https://download.pytorch.org/whl/cpu \
+       "torch>=2.1.0" "torchvision>=0.16.0" "torchaudio>=2.1.0" \
+    && pip --no-cache-dir install --no-deps -r /app/requirements.txt
+
+# ---------------------------
+# Copy app code
+# ---------------------------
+COPY . /app
+
+# Make entrypoint executable
+RUN chmod +x /app/entrypoint.sh || true
+
+# Create export artifacts dir (mountable)
+RUN mkdir -p /app/export_artifacts
+
+# Default envs (override at runtime)
+ENV HF_REPO="Sp2503/Finetuned-multilingualdataset-MuriL-model"
+ENV MODEL_DIR="$HF_REPO"
+ENV CSV_PATH="/app/export_artifacts/muril_multilingual_dataset.csv"
+ENV OUT_EMBED_PATH="/app/export_artifacts/answer_embeddings.pt"
+ENV HF_CACHE_DIR="/app/hf_cache"
+ENV UPLOAD_BACK="false"
+ENV FORCE_REGEN="false"
 ENV PORT=7860
+ENV DEVICE="cpu"
 
-EXPOSE 7860
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+EXPOSE ${PORT}
+
+# ENTRYPOINT: run regeneration (if required) and start uvicorn
+ENTRYPOINT ["/app/entrypoint.sh"]
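Usage note: a quick way to confirm that the CPU wheel actually landed in the built image is to query torch from inside the container. A minimal sketch, not part of this commit (the image tag is a placeholder):

    # Run as: docker run --rm <image-tag> python check_torch_cpu.py
    import torch

    print("torch:", torch.__version__)                    # CPU wheels typically report a "+cpu" suffix
    print("CUDA available:", torch.cuda.is_available())   # expected False for the CPU build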
entrypoint.sh ADDED
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+set -e
+
+CSV_PATH=${CSV_PATH:-/app/export_artifacts/muril_multilingual_dataset.csv}
+EMBED_PATH=${OUT_EMBED_PATH:-/app/export_artifacts/answer_embeddings.pt}
+HF_REPO=${HF_REPO:-Sp2503/Finetuned-multilingualdataset-MuriL-model}
+MODEL_DIR=${MODEL_DIR:-$HF_REPO}
+FORCE_REGEN=${FORCE_REGEN:-false}
+UPLOAD_BACK=${UPLOAD_BACK:-false}
+
+echo "Entrypoint: CSV_PATH=$CSV_PATH EMBED_PATH=$EMBED_PATH MODEL_DIR=$MODEL_DIR FORCE_REGEN=$FORCE_REGEN"
+
+mkdir -p "$(dirname "$EMBED_PATH")"
+
+need_regen=false
+if [ ! -f "$EMBED_PATH" ]; then
+  echo "Embeddings not found -> will regenerate."
+  need_regen=true
+fi
+
+if [ "$FORCE_REGEN" = "true" ] || [ "$FORCE_REGEN" = "1" ]; then
+  echo "FORCE_REGEN -> will regenerate embeddings."
+  need_regen=true
+fi
+
+if [ "$need_regen" = "true" ]; then
+  export MODEL_DIR="${MODEL_DIR}"
+  export CSV_PATH="${CSV_PATH}"
+  export OUT_EMBED_PATH="${EMBED_PATH}"
+  export HF_REPO="${HF_REPO}"
+  export UPLOAD_BACK="${UPLOAD_BACK}"
+  python /app/regenerate_embeddings.py
+  echo "Regeneration finished."
+else
+  echo "Skipping regeneration."
+fi
+
+exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}
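Usage note: the entrypoint regenerates embeddings only when the .pt file is missing or FORCE_REGEN is set, so a stale file can silently outlive a changed CSV. A minimal consistency check, assuming the default paths from the Dockerfile (hypothetical helper, not part of this commit):

    # verify_artifacts.py — compare embedding count against CSV rows
    import os
    import pandas as pd
    import torch

    csv_path = os.getenv("CSV_PATH", "/app/export_artifacts/muril_multilingual_dataset.csv")
    embed_path = os.getenv("OUT_EMBED_PATH", "/app/export_artifacts/answer_embeddings.pt")

    embs = torch.load(embed_path, map_location="cpu")  # (N, H) tensor saved by regenerate_embeddings.py
    rows = len(pd.read_csv(csv_path, dtype=str))
    assert embs.shape[0] == rows, f"{embs.shape[0]} embeddings vs {rows} CSV rows -- set FORCE_REGEN=true"
    print("Artifacts consistent:", tuple(embs.shape))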
model_loader.py ADDED
@@ -0,0 +1,78 @@
+# model_loader.py
+import os
+from pathlib import Path
+import torch
+
+def load_model_and_tokenizer(model_repo_dir_or_local_path: str, base_model_id: str = "google/muril-base-cased", device: str = None):
+    """
+    Robust loader that:
+      - loads the full model if model.safetensors or pytorch_model.bin exists in model_repo_dir_or_local_path
+      - otherwise loads base_model_id and applies the PEFT adapter from adapter_model.safetensors (if present)
+    Returns (tokenizer, model_on_device, backend_str).
+    """
+    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+    model_dir = Path(model_repo_dir_or_local_path)
+
+    # Prefer a safetensors full model if present
+    full_model_files = ["model.safetensors", "pytorch_model.bin"]
+    adapter_files = ["adapter_model.safetensors", "adapter_config.json", "adapter.safetensors"]
+
+    # Import delayed to allow the environment to control package errors
+    from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+
+    tokenizer = None
+    model = None
+
+    # 1) Try to load a full model from model_dir
+    for f in full_model_files:
+        full_path = model_dir / f
+        if full_path.exists():
+            tokenizer = AutoTokenizer.from_pretrained(str(model_dir), use_fast=True)
+            try:
+                model = AutoModel.from_pretrained(str(model_dir), trust_remote_code=False)
+                backend = "full-AutoModel"
+            except Exception:
+                model = AutoModelForSequenceClassification.from_pretrained(str(model_dir), trust_remote_code=False)
+                backend = "full-AutoModelForSequenceClassification"
+            model.to(device)
+            model.eval()
+            return tokenizer, model, backend
+
+    # 2) If a full model is not present, check for adapter files
+    has_adapter = any((model_dir / af).exists() for af in adapter_files)
+    if has_adapter:
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(str(model_dir), use_fast=True)
+        except Exception:
+            tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)
+
+        try:
+            base = AutoModel.from_pretrained(str(model_dir))
+            base_name = str(model_dir)
+        except Exception:
+            base = AutoModel.from_pretrained(base_model_id)
+            base_name = base_model_id
+
+        base.to(device)
+        base.eval()
+
+        try:
+            from peft import PeftModel
+            adapter_dir = str(model_dir)
+            # If adapter files exist, PeftModel.from_pretrained will pick them up
+            peft_model = PeftModel.from_pretrained(base, adapter_dir, is_trainable=False)
+            peft_model.to(device)
+            peft_model.eval()
+            return tokenizer, peft_model, "peft-attached"
+        except Exception as e:
+            raise RuntimeError(f"Failed to load/apply PEFT adapter from {model_dir}: {e}") from e
+
+    # 3) Fallback: try a direct load (may fail)
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(str(model_dir), use_fast=True)
+        model = AutoModel.from_pretrained(str(model_dir))
+        model.to(device)
+        model.eval()
+        return tokenizer, model, "auto-fallback"
+    except Exception as e:
+        raise RuntimeError(f"Unable to load model or adapters from {model_dir}. Error: {e}") from e
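Usage note: a minimal sketch of calling the loader (the MODEL_DIR fallback and the sample sentence are illustrative; device is pinned to CPU to match the image defaults):

    import os
    import torch
    from model_loader import load_model_and_tokenizer

    tokenizer, model, backend = load_model_and_tokenizer(
        os.getenv("MODEL_DIR", "google/muril-base-cased"), device="cpu"
    )
    print("backend:", backend)

    enc = tokenizer(["What documents do I need?"], padding=True, truncation=True, return_tensors="pt")
    with torch.inference_mode():
        out = model(**enc, return_dict=True)
    # Assumes an encoder backend; a classification backend would return logits instead
    print("hidden states:", out.last_hidden_state.shape)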
regenerate_embeddings.py ADDED
@@ -0,0 +1,125 @@
+# regenerate_embeddings.py
+"""
+Regenerate answer embeddings using the MuRIL model.
+This script:
+  - downloads the model (if MODEL_DIR is a repo id),
+  - reads the CSV at CSV_PATH,
+  - computes mean-pooled, L2-normalized embeddings for the 'answer' column,
+  - saves the embeddings to OUT_EMBED_PATH.
+
+Exit codes:
+  - 0 on success
+  - non-zero on failure
+"""
+import os, argparse, math, sys
+from pathlib import Path
+import torch
+import pandas as pd
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, AutoModel
+from huggingface_hub import snapshot_download
+
+def mean_pooling(last_hidden_state, attention_mask):
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
+    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    return sum_embeddings / sum_mask
+
+def parse_env():
+    # ENV-friendly arg parsing
+    cfg = {}
+    cfg['model_dir'] = os.getenv("MODEL_DIR", os.getenv("HF_REPO", "Sp2503/Finetuned-multilingualdataset-MuriL-model"))
+    cfg['csv_path'] = os.getenv("CSV_PATH", "/app/export_artifacts/muril_multilingual_dataset.csv")
+    cfg['out_path'] = os.getenv("OUT_EMBED_PATH", "/app/export_artifacts/answer_embeddings.pt")
+    cfg['batch_size'] = int(os.getenv("EMBED_BATCH_SIZE", "64"))
+    cfg['device'] = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
+    cfg['download_cache'] = os.getenv("HF_CACHE_DIR", "/tmp/hf_cache")
+    cfg['upload_back'] = os.getenv("UPLOAD_BACK", "false").lower() in ("1", "true", "yes")
+    cfg['hf_repo'] = os.getenv("HF_REPO", None)  # used for upload_back if set
+    return cfg
+
+def main():
+    cfg = parse_env()
+    print("Regenerate embeddings with config:", cfg)
+    model_dir = cfg['model_dir']
+    # If model_dir looks like a HF repo id (contains '/'), snapshot_download to the local cache
+    if "/" in model_dir and not os.path.isdir(model_dir):
+        print("Detected HF repo id for model. snapshot_download ->", cfg['download_cache'])
+        try:
+            model_dir = snapshot_download(repo_id=cfg['model_dir'], repo_type="model", cache_dir=cfg['download_cache'])
+            print("Downloaded model to:", model_dir)
+        except Exception as e:
+            print("Failed to snapshot_download model:", e, file=sys.stderr)
+            sys.exit(2)
+
+    csv_path = cfg['csv_path']
+    out_path = cfg['out_path']
+    batch_size = cfg['batch_size']
+    device = cfg['device']
+    print(f"Loading CSV: {csv_path}")
+    if not os.path.isfile(csv_path):
+        print(f"CSV not found at {csv_path}", file=sys.stderr)
+        sys.exit(3)
+    df = pd.read_csv(csv_path, dtype=str).fillna("")
+    if 'answer' not in df.columns:
+        print("CSV must contain 'answer' column", file=sys.stderr)
+        sys.exit(4)
+    answers = df['answer'].astype(str).tolist()
+    print(f"Encoding {len(answers)} answers on device {device} (batch_size={batch_size})")
+
+    # Load tokenizer & model
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
+        model = AutoModel.from_pretrained(model_dir)
+        model.to(device)
+        model.eval()
+    except Exception as e:
+        print("Failed to load model/tokenizer:", e, file=sys.stderr)
+        sys.exit(5)
+
+    # Compute embeddings
+    all_embs = []
+    try:
+        with torch.inference_mode():
+            for i in tqdm(range(0, len(answers), batch_size), desc="Batches"):
+                batch = answers[i:i+batch_size]
+                enc = tokenizer(batch, padding=True, truncation=True, max_length=256, return_tensors="pt")
+                input_ids = enc["input_ids"].to(device)
+                attention_mask = enc["attention_mask"].to(device)
+                out = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+                pooled = mean_pooling(out.last_hidden_state, attention_mask)  # (B, H)
+                pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)  # L2-normalize
+                all_embs.append(pooled.cpu())
+    except Exception as e:
+        print("Error during encoding:", e, file=sys.stderr)
+        sys.exit(6)
+
+    all_embs = torch.cat(all_embs, dim=0)
+    print("Final embeddings shape:", all_embs.shape)
+    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
+    torch.save(all_embs, out_path)
+    print("Saved embeddings to:", out_path)
+
+    # Optional: upload back to the HF repo (requires HF_TOKEN set and HF_REPO)
+    if cfg['upload_back'] and cfg['hf_repo']:
+        try:
+            from huggingface_hub import HfApi
+            api = HfApi()
+            print(f"Uploading {out_path} back to repo {cfg['hf_repo']} ...")
+            api.upload_file(
+                path_or_fileobj=out_path,
+                path_in_repo=os.path.basename(out_path),
+                repo_id=cfg['hf_repo'],
+                repo_type="model",
+            )
+            print("Upload complete.")
+        except Exception as e:
+            print("Upload back failed:", e, file=sys.stderr)
+
+    # Quick sanity check
+    norms = (all_embs * all_embs).sum(dim=1)
+    print("Sample norms (should be ~1.0):", norms[:5].tolist())
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
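Usage note: because the stored vectors are L2-normalized, a dot product against an equally normalized query embedding is cosine similarity. A retrieval sketch under that assumption (the query text, top-k value, and non-container default paths are illustrative; assumes MODEL_DIR points at a full encoder checkpoint):

    import os
    import pandas as pd
    import torch
    from transformers import AutoTokenizer, AutoModel
    from regenerate_embeddings import mean_pooling  # reuse the same pooling

    model_dir = os.getenv("MODEL_DIR", "google/muril-base-cased")
    tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
    model = AutoModel.from_pretrained(model_dir).eval()

    all_embs = torch.load(os.getenv("OUT_EMBED_PATH", "answer_embeddings.pt"), map_location="cpu")  # (N, H)
    answers = pd.read_csv(os.getenv("CSV_PATH", "muril_multilingual_dataset.csv"), dtype=str)["answer"].tolist()

    query = "How do I reset my password?"
    enc = tokenizer([query], padding=True, truncation=True, max_length=256, return_tensors="pt")
    with torch.inference_mode():
        out = model(**enc, return_dict=True)
    q = torch.nn.functional.normalize(
        mean_pooling(out.last_hidden_state, enc["attention_mask"]), p=2, dim=1
    )

    scores = (q @ all_embs.T).squeeze(0)  # cosine similarity: both sides are unit-norm
    top = torch.topk(scores, k=3)
    for score, idx in zip(top.values.tolist(), top.indices.tolist()):
        print(f"{score:.3f}  {answers[idx][:80]}")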
requirements.txt CHANGED
@@ -5,3 +5,5 @@ torch>=2.1.0
 transformers==4.46.0
 huggingface_hub>=0.14.1
 tqdm
+peft
+safetensors