# OpenLLMKyrgyzLeaderboard_v0.1/scripts/generate_kyrgyz_results_json.py
# Hugging Face file-viewer header (not part of the script):
#   author: TTimur, commit: "Initial commit" (c097229), size: 6.83 kB
#   viewer links: raw, history, blame
import json
import os
from typing import Dict, List
# README display name -> real Hugging Face Hub model ID, so leaderboard rows
# point at existing repos (still_on_hub=True). Display names missing from this
# map fall back to "TTimur/<display name>" in write_results.
MODEL_ID_MAP: Dict[str, str] = {
    # Qwen family
    "Qwen2.5-0.5B-Instruct": "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen2.5-3B-Instruct": "Qwen/Qwen2.5-3B-Instruct",
    "Qwen2.5-7B-Instruct": "Qwen/Qwen2.5-7B-Instruct",
    "Qwen3-0.6B": "Qwen/Qwen3-0.6B",
    "Qwen3-1.7B": "Qwen/Qwen3-1.7B",
    "Qwen3-4B": "Qwen/Qwen3-4B",
    "Qwen3-8B": "Qwen/Qwen3-8B",
    # Gemma family
    "gemma-3-270m": "google/gemma-3-270m",
    "gemma-3-1b-it": "google/gemma-3-1b-it",
    "gemma-3-4b-it": "google/gemma-3-4b-it",
    # Llama family
    "Llama-3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
    "Llama-3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct",
    "Llama-3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
}
# Benchmark task names, in the same order the per-model score rows use.
# NOTE(review): nothing in the visible part of this script references
# TASK_KEYS; confirm whether another module imports it before removing.
TASK_KEYS: List[str] = [
    "KyrgyzMMLU",
    "KyrgyzRC",
    "WinoGrande",
    "BoolQ",
    "HellaSwag",
    "GSM8K",
    "TruthfulQA",
]
# Zero-shot scores transcribed from the README tables.
# Layout: display name -> {task name -> score in percent}.
# NOTE(review): Llama-3.2-1B-Instruct HellaSwag (0.2) is far below every other
# row; confirm it against the README before publishing.
ZERO_SHOT: Dict[str, Dict[str, float]] = {
    # Qwen family
    "Qwen2.5-0.5B-Instruct": {"KyrgyzMMLU": 27.4, "KyrgyzRC": 53.2, "WinoGrande": 51.5, "BoolQ": 37.9, "HellaSwag": 14.6, "GSM8K": 0.7, "TruthfulQA": 33.5},
    "Qwen2.5-1.5B-Instruct": {"KyrgyzMMLU": 27.9, "KyrgyzRC": 60.5, "WinoGrande": 50.1, "BoolQ": 38.6, "HellaSwag": 22.9, "GSM8K": 0.7, "TruthfulQA": 32.5},
    "Qwen2.5-3B-Instruct": {"KyrgyzMMLU": 28.6, "KyrgyzRC": 66.0, "WinoGrande": 50.5, "BoolQ": 59.4, "HellaSwag": 22.0, "GSM8K": 0.7, "TruthfulQA": 34.2},
    "Qwen2.5-7B-Instruct": {"KyrgyzMMLU": 31.5, "KyrgyzRC": 70.0, "WinoGrande": 48.7, "BoolQ": 56.3, "HellaSwag": 10.0, "GSM8K": 1.1, "TruthfulQA": 34.1},
    "Qwen3-0.6B": {"KyrgyzMMLU": 26.0, "KyrgyzRC": 61.8, "WinoGrande": 49.8, "BoolQ": 38.0, "HellaSwag": 11.1, "GSM8K": 0.7, "TruthfulQA": 29.9},
    "Qwen3-1.7B": {"KyrgyzMMLU": 27.9, "KyrgyzRC": 61.8, "WinoGrande": 48.9, "BoolQ": 40.4, "HellaSwag": 24.6, "GSM8K": 0.7, "TruthfulQA": 29.6},
    "Qwen3-4B": {"KyrgyzMMLU": 30.3, "KyrgyzRC": 68.2, "WinoGrande": 49.0, "BoolQ": 38.3, "HellaSwag": 24.5, "GSM8K": 0.7, "TruthfulQA": 32.9},
    "Qwen3-8B": {"KyrgyzMMLU": 32.1, "KyrgyzRC": 71.8, "WinoGrande": 51.0, "BoolQ": 39.2, "HellaSwag": 24.6, "GSM8K": 0.7, "TruthfulQA": 34.7},
    # Gemma family
    "gemma-3-1b-it": {"KyrgyzMMLU": 26.7, "KyrgyzRC": 58.2, "WinoGrande": 50.0, "BoolQ": 37.9, "HellaSwag": 24.4, "GSM8K": 0.7, "TruthfulQA": 34.0},
    "gemma-3-270m": {"KyrgyzMMLU": 27.5, "KyrgyzRC": 56.8, "WinoGrande": 48.3, "BoolQ": 37.9, "HellaSwag": 17.4, "GSM8K": 0.7, "TruthfulQA": 34.7},
    "gemma-3-4b-it": {"KyrgyzMMLU": 30.3, "KyrgyzRC": 70.2, "WinoGrande": 50.6, "BoolQ": 58.3, "HellaSwag": 24.6, "GSM8K": 0.7, "TruthfulQA": 34.7},
    # Llama family
    "Llama-3.1-8B-Instruct": {"KyrgyzMMLU": 31.0, "KyrgyzRC": 75.2, "WinoGrande": 50.6, "BoolQ": 50.3, "HellaSwag": 26.6, "GSM8K": 0.7, "TruthfulQA": 33.7},
    "Llama-3.2-1B-Instruct": {"KyrgyzMMLU": 26.3, "KyrgyzRC": 58.2, "WinoGrande": 49.4, "BoolQ": 38.3, "HellaSwag": 0.2, "GSM8K": 0.7, "TruthfulQA": 30.1},
    "Llama-3.2-3B-Instruct": {"KyrgyzMMLU": 27.8, "KyrgyzRC": 64.2, "WinoGrande": 49.1, "BoolQ": 43.1, "HellaSwag": 24.5, "GSM8K": 0.7, "TruthfulQA": 31.5},
}
# Few-shot scores transcribed from the README tables.
# Layout: display name -> {task name -> score in percent}.
# NOTE(review): the gemma-3-4b-it row (KyrgyzRC 25.0, GSM8K 0.0,
# TruthfulQA 50.0) looks like placeholder or chance-level values; confirm it
# against the README before publishing.
FEW_SHOT: Dict[str, Dict[str, float]] = {
    # Qwen family
    "Qwen2.5-0.5B-Instruct": {"KyrgyzMMLU": 25.4, "KyrgyzRC": 54.0, "WinoGrande": 49.7, "BoolQ": 61.0, "HellaSwag": 25.9, "GSM8K": 2.2, "TruthfulQA": 33.4},
    "Qwen2.5-1.5B-Instruct": {"KyrgyzMMLU": 28.7, "KyrgyzRC": 67.5, "WinoGrande": 50.1, "BoolQ": 58.0, "HellaSwag": 26.5, "GSM8K": 6.1, "TruthfulQA": 32.9},
    "Qwen2.5-3B-Instruct": {"KyrgyzMMLU": 34.0, "KyrgyzRC": 73.2, "WinoGrande": 51.3, "BoolQ": 57.4, "HellaSwag": 23.7, "GSM8K": 9.5, "TruthfulQA": 34.4},
    "Qwen2.5-7B-Instruct": {"KyrgyzMMLU": 38.5, "KyrgyzRC": 74.8, "WinoGrande": 50.4, "BoolQ": 64.6, "HellaSwag": 17.8, "GSM8K": 32.1, "TruthfulQA": 36.2},
    "Qwen3-0.6B": {"KyrgyzMMLU": 26.8, "KyrgyzRC": 59.5, "WinoGrande": 50.1, "BoolQ": 60.1, "HellaSwag": 26.4, "GSM8K": 4.3, "TruthfulQA": 30.0},
    "Qwen3-1.7B": {"KyrgyzMMLU": 30.8, "KyrgyzRC": 71.2, "WinoGrande": 48.6, "BoolQ": 62.0, "HellaSwag": 25.2, "GSM8K": 18.5, "TruthfulQA": 30.3},
    "Qwen3-4B": {"KyrgyzMMLU": 38.5, "KyrgyzRC": 77.2, "WinoGrande": 48.1, "BoolQ": 74.0, "HellaSwag": 24.7, "GSM8K": 51.5, "TruthfulQA": 32.5},
    "Qwen3-8B": {"KyrgyzMMLU": 44.5, "KyrgyzRC": 81.8, "WinoGrande": 50.6, "BoolQ": 76.9, "HellaSwag": 26.4, "GSM8K": 60.0, "TruthfulQA": 35.8},
    # Gemma family
    "gemma-3-1b-it": {"KyrgyzMMLU": 26.5, "KyrgyzRC": 38.0, "WinoGrande": 48.9, "BoolQ": 62.8, "HellaSwag": 23.5, "GSM8K": 3.2, "TruthfulQA": 31.3},
    "gemma-3-270m": {"KyrgyzMMLU": 27.0, "KyrgyzRC": 53.2, "WinoGrande": 48.7, "BoolQ": 61.5, "HellaSwag": 27.6, "GSM8K": 1.4, "TruthfulQA": 36.6},
    "gemma-3-4b-it": {"KyrgyzMMLU": 29.5, "KyrgyzRC": 25.0, "WinoGrande": 49.6, "BoolQ": 62.1, "HellaSwag": 24.6, "GSM8K": 0.0, "TruthfulQA": 50.0},
    # Llama family
    "Llama-3.1-8B-Instruct": {"KyrgyzMMLU": 38.1, "KyrgyzRC": 80.5, "WinoGrande": 51.6, "BoolQ": 75.5, "HellaSwag": 21.9, "GSM8K": 37.0, "TruthfulQA": 34.4},
    "Llama-3.2-1B-Instruct": {"KyrgyzMMLU": 26.1, "KyrgyzRC": 45.8, "WinoGrande": 49.7, "BoolQ": 62.0, "HellaSwag": 25.8, "GSM8K": 2.7, "TruthfulQA": 30.3},
    "Llama-3.2-3B-Instruct": {"KyrgyzMMLU": 29.4, "KyrgyzRC": 64.8, "WinoGrande": 48.9, "BoolQ": 62.3, "HellaSwag": 25.3, "GSM8K": 12.9, "TruthfulQA": 32.9},
}
def to_result_json(model_id: str, metrics: Dict[str, float]) -> Dict:
    """Build the result payload for one model.

    Args:
        model_id: Identifier stored under ``config.model_name``.
        metrics: Task name -> score in percent. A score of ``None`` is
            passed through unchanged.

    Returns:
        A dict with two keys: ``config`` (the model name plus the fixed
        dtype ``"float16"`` and sha ``"main"``) and ``results``, which maps
        every task to ``{"metric_name": fraction}`` where ``fraction`` is
        the percent score divided by 100.
    """
    results = {}
    for task, percent in metrics.items():
        if percent is None:
            fraction = None
        else:
            # Plain division leaks binary float noise into the JSON
            # (0.7 / 100.0 == 0.006999999999999999). Rounding to 10 decimal
            # places restores the intended decimal value while keeping far
            # more precision than the one-decimal percent inputs carry.
            fraction = round(percent / 100.0, 10)
        results[task] = {"metric_name": fraction}

    return {
        "config": {
            "model_name": model_id,
            "model_dtype": "float16",
            "model_sha": "main",
        },
        "results": results,
    }
def write_results(out_dir: str, table: Dict[str, Dict[str, float]], tag: str) -> None:
    """Write one result JSON file per model in ``table``.

    Args:
        out_dir: Destination directory; created if it does not exist.
        table: Display name -> {task name -> score in percent}.
        tag: Suffix placed in each file name (``main`` passes "zero_shot"
            and "few_shot").

    Each display name is resolved through ``MODEL_ID_MAP``; names missing
    from the map fall back to ``TTimur/<display name>``. The path of every
    written file is printed.
    """
    os.makedirs(out_dir, exist_ok=True)
    for display_name, metrics in table.items():
        model_id = MODEL_ID_MAP.get(display_name, f"TTimur/{display_name}")
        payload = to_result_json(model_id, metrics)
        # Model IDs contain "/", which would otherwise create subdirectories.
        # Any name ending in .json is acceptable for the results files.
        safe_model = model_id.replace("/", "__")
        out_path = os.path.join(out_dir, f"results_{safe_model}_{tag}.json")
        # ensure_ascii=False emits non-ASCII characters verbatim, so the
        # encoding is pinned to UTF-8 instead of the platform locale default.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        print(f"Wrote {out_path}")
def main():
    """Write the zero-shot and few-shot result files.

    The output directory is read from the ``OUT_DIR`` environment variable
    and defaults to ``./kyrgyz_results``.
    """
    target_dir = os.environ.get("OUT_DIR", "./kyrgyz_results")
    runs = (
        (ZERO_SHOT, "zero_shot"),
        (FEW_SHOT, "few_shot"),
    )
    for score_table, run_tag in runs:
        write_results(target_dir, score_table, run_tag)
    print("Done. Upload generated JSONs to your dataset repo (e.g., TTimur/results_kg_v0.1)")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()