import json
import os
from typing import Dict, List, Optional


# Display name (the key used in the ZERO_SHOT / FEW_SHOT tables) -> "org/name"
# model id written into each result file's config.model_name.
# NOTE(review): these look like Hugging Face Hub repo ids -- confirm.
MODEL_ID_MAP: Dict[str, str] = {
    # Qwen
    "Qwen2.5-0.5B-Instruct": "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen2.5-3B-Instruct": "Qwen/Qwen2.5-3B-Instruct",
    "Qwen2.5-7B-Instruct": "Qwen/Qwen2.5-7B-Instruct",
    "Qwen3-0.6B": "Qwen/Qwen3-0.6B",
    "Qwen3-1.7B": "Qwen/Qwen3-1.7B",
    "Qwen3-4B": "Qwen/Qwen3-4B",
    "Qwen3-8B": "Qwen/Qwen3-8B",
    # Gemma
    "gemma-3-270m": "google/gemma-3-270m",
    "gemma-3-1b-it": "google/gemma-3-1b-it",
    "gemma-3-4b-it": "google/gemma-3-4b-it",
    # Llama
    "Llama-3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
    "Llama-3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct",
    "Llama-3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
}


# Benchmark names, in the order they appear in every per-model row of the
# ZERO_SHOT and FEW_SHOT tables.
TASK_KEYS: List[str] = [
    "KyrgyzMMLU",
    "KyrgyzRC",
    "WinoGrande",
    "BoolQ",
    "HellaSwag",
    "GSM8K",
    "TruthfulQA",
]


# Zero-shot scores: display name -> {task key -> score}.
# Scores are on a 0-100 scale here; to_result_json divides them by 100 when
# building the output payload. Row order is preserved in the generated output.
ZERO_SHOT: Dict[str, Dict[str, float]] = {
    # Qwen
    "Qwen2.5-0.5B-Instruct": {"KyrgyzMMLU": 27.4, "KyrgyzRC": 53.2, "WinoGrande": 51.5, "BoolQ": 37.9, "HellaSwag": 14.6, "GSM8K": 0.7, "TruthfulQA": 33.5},
    "Qwen2.5-1.5B-Instruct": {"KyrgyzMMLU": 27.9, "KyrgyzRC": 60.5, "WinoGrande": 50.1, "BoolQ": 38.6, "HellaSwag": 22.9, "GSM8K": 0.7, "TruthfulQA": 32.5},
    "Qwen2.5-3B-Instruct": {"KyrgyzMMLU": 28.6, "KyrgyzRC": 66.0, "WinoGrande": 50.5, "BoolQ": 59.4, "HellaSwag": 22.0, "GSM8K": 0.7, "TruthfulQA": 34.2},
    "Qwen2.5-7B-Instruct": {"KyrgyzMMLU": 31.5, "KyrgyzRC": 70.0, "WinoGrande": 48.7, "BoolQ": 56.3, "HellaSwag": 10.0, "GSM8K": 1.1, "TruthfulQA": 34.1},
    "Qwen3-0.6B": {"KyrgyzMMLU": 26.0, "KyrgyzRC": 61.8, "WinoGrande": 49.8, "BoolQ": 38.0, "HellaSwag": 11.1, "GSM8K": 0.7, "TruthfulQA": 29.9},
    "Qwen3-1.7B": {"KyrgyzMMLU": 27.9, "KyrgyzRC": 61.8, "WinoGrande": 48.9, "BoolQ": 40.4, "HellaSwag": 24.6, "GSM8K": 0.7, "TruthfulQA": 29.6},
    "Qwen3-4B": {"KyrgyzMMLU": 30.3, "KyrgyzRC": 68.2, "WinoGrande": 49.0, "BoolQ": 38.3, "HellaSwag": 24.5, "GSM8K": 0.7, "TruthfulQA": 32.9},
    "Qwen3-8B": {"KyrgyzMMLU": 32.1, "KyrgyzRC": 71.8, "WinoGrande": 51.0, "BoolQ": 39.2, "HellaSwag": 24.6, "GSM8K": 0.7, "TruthfulQA": 34.7},
    # Gemma
    "gemma-3-1b-it": {"KyrgyzMMLU": 26.7, "KyrgyzRC": 58.2, "WinoGrande": 50.0, "BoolQ": 37.9, "HellaSwag": 24.4, "GSM8K": 0.7, "TruthfulQA": 34.0},
    "gemma-3-270m": {"KyrgyzMMLU": 27.5, "KyrgyzRC": 56.8, "WinoGrande": 48.3, "BoolQ": 37.9, "HellaSwag": 17.4, "GSM8K": 0.7, "TruthfulQA": 34.7},
    "gemma-3-4b-it": {"KyrgyzMMLU": 30.3, "KyrgyzRC": 70.2, "WinoGrande": 50.6, "BoolQ": 58.3, "HellaSwag": 24.6, "GSM8K": 0.7, "TruthfulQA": 34.7},
    # Llama
    "Llama-3.1-8B-Instruct": {"KyrgyzMMLU": 31.0, "KyrgyzRC": 75.2, "WinoGrande": 50.6, "BoolQ": 50.3, "HellaSwag": 26.6, "GSM8K": 0.7, "TruthfulQA": 33.7},
    "Llama-3.2-1B-Instruct": {"KyrgyzMMLU": 26.3, "KyrgyzRC": 58.2, "WinoGrande": 49.4, "BoolQ": 38.3, "HellaSwag": 0.2, "GSM8K": 0.7, "TruthfulQA": 30.1},
    "Llama-3.2-3B-Instruct": {"KyrgyzMMLU": 27.8, "KyrgyzRC": 64.2, "WinoGrande": 49.1, "BoolQ": 43.1, "HellaSwag": 24.5, "GSM8K": 0.7, "TruthfulQA": 31.5},
}


# Few-shot scores: display name -> {task key -> score}.
# Scores are on a 0-100 scale here; to_result_json divides them by 100 when
# building the output payload. Row order is preserved in the generated output.
FEW_SHOT: Dict[str, Dict[str, float]] = {
    # Qwen
    "Qwen2.5-0.5B-Instruct": {"KyrgyzMMLU": 25.4, "KyrgyzRC": 54.0, "WinoGrande": 49.7, "BoolQ": 61.0, "HellaSwag": 25.9, "GSM8K": 2.2, "TruthfulQA": 33.4},
    "Qwen2.5-1.5B-Instruct": {"KyrgyzMMLU": 28.7, "KyrgyzRC": 67.5, "WinoGrande": 50.1, "BoolQ": 58.0, "HellaSwag": 26.5, "GSM8K": 6.1, "TruthfulQA": 32.9},
    "Qwen2.5-3B-Instruct": {"KyrgyzMMLU": 34.0, "KyrgyzRC": 73.2, "WinoGrande": 51.3, "BoolQ": 57.4, "HellaSwag": 23.7, "GSM8K": 9.5, "TruthfulQA": 34.4},
    "Qwen2.5-7B-Instruct": {"KyrgyzMMLU": 38.5, "KyrgyzRC": 74.8, "WinoGrande": 50.4, "BoolQ": 64.6, "HellaSwag": 17.8, "GSM8K": 32.1, "TruthfulQA": 36.2},
    "Qwen3-0.6B": {"KyrgyzMMLU": 26.8, "KyrgyzRC": 59.5, "WinoGrande": 50.1, "BoolQ": 60.1, "HellaSwag": 26.4, "GSM8K": 4.3, "TruthfulQA": 30.0},
    "Qwen3-1.7B": {"KyrgyzMMLU": 30.8, "KyrgyzRC": 71.2, "WinoGrande": 48.6, "BoolQ": 62.0, "HellaSwag": 25.2, "GSM8K": 18.5, "TruthfulQA": 30.3},
    "Qwen3-4B": {"KyrgyzMMLU": 38.5, "KyrgyzRC": 77.2, "WinoGrande": 48.1, "BoolQ": 74.0, "HellaSwag": 24.7, "GSM8K": 51.5, "TruthfulQA": 32.5},
    "Qwen3-8B": {"KyrgyzMMLU": 44.5, "KyrgyzRC": 81.8, "WinoGrande": 50.6, "BoolQ": 76.9, "HellaSwag": 26.4, "GSM8K": 60.0, "TruthfulQA": 35.8},
    # Gemma
    "gemma-3-1b-it": {"KyrgyzMMLU": 26.5, "KyrgyzRC": 38.0, "WinoGrande": 48.9, "BoolQ": 62.8, "HellaSwag": 23.5, "GSM8K": 3.2, "TruthfulQA": 31.3},
    "gemma-3-270m": {"KyrgyzMMLU": 27.0, "KyrgyzRC": 53.2, "WinoGrande": 48.7, "BoolQ": 61.5, "HellaSwag": 27.6, "GSM8K": 1.4, "TruthfulQA": 36.6},
    # NOTE(review): KyrgyzRC 25.0, GSM8K 0.0 and TruthfulQA 50.0 are round
    # values unlike every other row -- confirm these are real measurements
    # and not placeholders or a failed run.
    "gemma-3-4b-it": {"KyrgyzMMLU": 29.5, "KyrgyzRC": 25.0, "WinoGrande": 49.6, "BoolQ": 62.1, "HellaSwag": 24.6, "GSM8K": 0.0, "TruthfulQA": 50.0},
    # Llama
    "Llama-3.1-8B-Instruct": {"KyrgyzMMLU": 38.1, "KyrgyzRC": 80.5, "WinoGrande": 51.6, "BoolQ": 75.5, "HellaSwag": 21.9, "GSM8K": 37.0, "TruthfulQA": 34.4},
    "Llama-3.2-1B-Instruct": {"KyrgyzMMLU": 26.1, "KyrgyzRC": 45.8, "WinoGrande": 49.7, "BoolQ": 62.0, "HellaSwag": 25.8, "GSM8K": 2.7, "TruthfulQA": 30.3},
    "Llama-3.2-3B-Instruct": {"KyrgyzMMLU": 29.4, "KyrgyzRC": 64.8, "WinoGrande": 48.9, "BoolQ": 62.3, "HellaSwag": 25.3, "GSM8K": 12.9, "TruthfulQA": 32.9},
}


def to_result_json(model_id: str, metrics: Dict[str, Optional[float]]) -> Dict:
    """Build the JSON-serialisable result payload for one model.

    Args:
        model_id: Identifier stored under ``config.model_name``.
        metrics: Task key -> score on a 0-100 scale. A ``None`` score is
            passed through as ``None`` rather than being divided.

    Returns:
        A dict with a ``config`` section (model name plus the fixed
        ``model_dtype`` "float16" and ``model_sha`` "main") and a ``results``
        section mapping each task key to ``{"metric_name": score / 100}``.
    """
    results = {}
    for task, score in metrics.items():
        # NOTE(review): the inner key is the literal string "metric_name" for
        # every task -- confirm that is the key the consumer of these files
        # expects.
        scaled = None if score is None else score / 100.0
        results[task] = {"metric_name": scaled}

    return {
        "config": {
            "model_name": model_id,
            "model_dtype": "float16",
            "model_sha": "main",
        },
        "results": results,
    }


def write_results(out_dir: str, table: Dict[str, Dict[str, float]], tag: str) -> List[str]:
    """Write one result JSON file per model in *table*.

    Args:
        out_dir: Destination directory; created (with parents) if missing.
        table: Display name -> {task key -> score}.
        tag: Suffix placed in each file name, e.g. ``"zero_shot"``.

    Returns:
        The paths of the files written, in *table* order.
    """
    os.makedirs(out_dir, exist_ok=True)

    written: List[str] = []
    for display_name, metrics in table.items():
        # Names missing from MODEL_ID_MAP fall back to the TTimur/ namespace.
        model_id = MODEL_ID_MAP.get(display_name, f"TTimur/{display_name}")
        payload = to_result_json(model_id, metrics)

        # "/" would create a sub-directory, so flatten it for the file name.
        safe_model = model_id.replace("/", "__")
        out_path = os.path.join(out_dir, f"results_{safe_model}_{tag}.json")

        # ensure_ascii=False can emit non-ASCII characters, so pin the file
        # encoding instead of relying on the platform default.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

        print(f"Wrote {out_path}")
        written.append(out_path)

    return written


def main() -> None:
    """Export the zero-shot and few-shot tables as per-model JSON files.

    The output directory comes from the ``OUT_DIR`` environment variable and
    defaults to ``./kyrgyz_results``.
    """
    out_dir = os.environ.get("OUT_DIR", "./kyrgyz_results")

    write_results(out_dir, ZERO_SHOT, "zero_shot")
    write_results(out_dir, FEW_SHOT, "few_shot")

    print("Done. Upload generated JSONs to your dataset repo (e.g., TTimur/results_kg_v0.1)")


# Run the export only when executed as a script, not on import.
if __name__ == "__main__":
    main()