import argparse
import json
import os
from typing import Dict, List, Union

import datasets
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from PIL import Image
from tqdm import tqdm
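
# Tolerance used when cross-checking the recomputed total score against the
# aggregate stored in results.json.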
EPS = 1e-6
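
# Feature schema for the per-question detailed results dataset that gets pushed to the Hub.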
RESULT_FEATURES = {
    "id": datasets.Value("int32"),
    "images": datasets.Sequence(datasets.Image()),
    "question": datasets.Value("string"),
    "ground_truth": datasets.Value("string"),
    "criteria": datasets.Value("string"),
    "subtask": datasets.Value("string"),
    "response": datasets.Value("string"),
    "score": datasets.Value("float32"),
    "reason": datasets.Value("string"),
}
| SUBTASKS = ["Concrete Recognition", "Analytical Questions", "Evaluative Questions", "Divergent Thinking", "Real-world Assistance"] | |
def load_images(config) -> Dict[int, List[Image.Image]]:
    dataset = datasets.load_dataset(config["dataset_path"], config["dataset_name"], split=config["test_split"])
    images = {}
    for data in tqdm(dataset, desc="Loading images"):
        images[data["id"]] = data["images"]
    return images
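
# Join each per-sample entry from the detailed log with its images and rebuild a
# datasets.Dataset that matches RESULT_FEATURES.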
def get_hf_results(results, detailed_results):
    live_bench_images = load_images(results["configs"]["live_bench"])

    def load_results():
        for result in tqdm(detailed_results["logs"], desc="Loading results"):
            doc = result["doc"]
            res = {}
            res["id"] = doc["id"]
            res["images"] = live_bench_images[doc["id"]]
            res["question"] = doc["question"]
            res["ground_truth"] = doc["answer"]
            res["criteria"] = doc["criteria"]
            res["subtask"] = doc["subtask"]
            res["response"] = result["filtered_resps"][0]
            res["score"] = result["gpt4_eval_score"]["rating"]
            res["reason"] = result["gpt4_eval_score"]["explanation"]
            yield res

    result_dataset = Dataset.from_generator(load_results, features=datasets.Features(RESULT_FEATURES))
    return result_dataset
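
# Print a preview framed by a banner heading.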
def preview_results(results, heading: str):
    HEADING = "=" * 15 + " " + heading + " " + "=" * 15
    ENDING = "=" * len(HEADING)
    print(HEADING)
    print(results)
    print(ENDING)
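
# Average the per-sample ratings (from gpt4_eval_score) per subtask. Ratings are on a
# 0-10 scale and reported as 0-100 percentages; a rating of -1 marks an unrated sample
# and is skipped.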
def calculate_score(results: Dataset):
    results = results.to_pandas()
    sum_score, count = 0, 0
    score = {}
    for subtask in SUBTASKS:
        score[subtask] = []
    for _, result in tqdm(results.iterrows(), total=len(results), desc="Calculating score"):
        if result["score"] == -1:
            continue
        sum_score += result["score"] / 10
        count += 1
        subtask = result["subtask"]
        if subtask not in SUBTASKS:
            subtask = "Further Insights"
        # Use the (possibly remapped) subtask so unknown subtasks land in the
        # "Further Insights" bucket instead of raising a KeyError.
        score.setdefault(subtask, []).append(result["score"] / 10)
    res = [(subtask, len(score[subtask]), np.mean(score[subtask]) * 100 if score[subtask] else 0.0) for subtask in SUBTASKS]
    res.append(("Total", count, sum_score / count * 100 if count else 0.0))
    res = pd.DataFrame(res, columns=["Subtask", "Count", "Score"])
    return res
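
# Read results.json and live_bench.json from the evaluation output folder, rebuild the
# detailed results dataset, and sanity-check the recomputed total against the logged score.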
| def get_results(folder): | |
| detailed_file = os.path.join(folder, "live_bench.json") | |
| results_file = os.path.join(folder, "results.json") | |
| with open(results_file, "r") as f: | |
| results = json.load(f) | |
| assert "live_bench" in results["configs"], "No live_bench config found in results.json" | |
| final_score = results["results"]["live_bench"]["gpt4_eval_score,none"] | |
| model_configs = results["model_configs"] | |
| version = results["configs"]["live_bench"]["metadata"]["version"] | |
| assert model_configs["limit"] is None, "Model limit is not None, please check if the model is tested on the full dataset" | |
| with open(detailed_file, "r") as f: | |
| detailed_results = json.load(f) | |
| hf_results = get_hf_results(results, detailed_results) | |
| preview_results(hf_results.to_pandas().iloc[0], "Detailed Results") | |
| score = calculate_score(hf_results) | |
| preview_results(score, "Final Score") | |
| assert ( | |
| abs(score[score["Subtask"] == "Total"]["Score"] - final_score) <= EPS | |
| ).all(), f"Final score does not match the calculated score, the score calculated by the script is {score[score['Subtask'] == 'Total']['Score'].values[0]} and the final score in the log is {final_score}." | |
| return hf_results, score, version | |
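
# Push the detailed results as a split of lmms-lab/LiveBenchDetailedResults, save the
# score table locally as CSV, and update this model's row in the lmms-lab/LiveBenchResults
# leaderboard for the given dataset version.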
def upload_results(
    hf_results: Dataset,
    score: pd.DataFrame,
    model_name,
    dataset_version,
    log_folder="logs",
):
    hf_results.push_to_hub(
        "lmms-lab/LiveBenchDetailedResults",
        config_name=dataset_version,
        split=model_name.replace("-", "_"),
    )
    os.makedirs(log_folder, exist_ok=True)
    score_path = os.path.abspath(os.path.join(log_folder, f"{dataset_version}_{model_name}.csv"))
    score.to_csv(score_path, index=False)
    print(f"Results saved to {score_path}")
    score_dict = {item["Subtask"]: item["Score"] for _, item in score.iterrows()}
    score_dict["Model Name"] = model_name
    try:
        hf_score = datasets.load_dataset("lmms-lab/LiveBenchResults", dataset_version, split="test")
    except Exception:
        # No leaderboard exists for this dataset version yet; start from an empty table.
        hf_score = Dataset.from_dict({subtask: [] for subtask in ["Model Name", "Total"] + SUBTASKS})
    hf_score = hf_score.add_item(score_dict)
    df_score = pd.DataFrame(hf_score)
    df_score = df_score.drop_duplicates(subset=["Model Name"], keep="last")
    df_score = df_score[["Model Name", "Total"] + SUBTASKS]
    hf_score = Dataset.from_pandas(df_score)
    hf_score.push_to_hub("lmms-lab/LiveBenchResults", dataset_version, split="test")
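
# Example invocation (the file name below is just a placeholder for wherever this
# script is saved):
#   python upload_results.py --folder logs/<results_folder> --name <model_name>
# Pass --force / -F to skip the confirmation prompt.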
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", "-f", type=str, required=True, help="Results folder")
    parser.add_argument("--name", "-m", type=str, required=True, help="Model name")
    parser.add_argument("--log_folder", "-l", type=str, default="logs", help="Log folder")
    parser.add_argument("--force", "-F", action="store_true", help="Force upload")
    args = parser.parse_args()
    hf_results, score, version = get_results(args.folder)
    print(f"Results will be uploaded with model name {args.name} and model version {version}")
    if not args.force:
        print("Are you sure you want to upload the results? (y/n)", end=" ")
        while True:
            choice = input().lower()
            if choice == "y":
                break
            elif choice == "n":
                exit()
            else:
                print("Invalid choice, please enter 'y' or 'n'")
    upload_results(hf_results, score, args.name, version, args.log_folder)