File size: 16,380 Bytes

b0c0df0

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_1 = \"gpt-4o\"\n",
    "model_2 = \"Qwen-VL-72B-Instruct\"\n",
    "\n",
    "model_1 = model_1.replace(\"-\", \"_\")\n",
    "model_2 = model_2.replace(\"-\", \"_\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "\n",
    "data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-09\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_data_1 = data[model_1].to_pandas()\n",
    "model_data_2 = data[model_2].to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask',\n",
       "       'website', 'response', 'score', 'reason'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_data_1.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "same_data = [\"images\", \"question\", \"ground_truth\", \"criteria\", \"subtask\", \"website\"]\n",
    "model_data_2.drop(columns=same_data, inplace=True)\n",
    "merged_data = model_data_1.merge(model_data_2, on=\"id\", suffixes=(f\"_1\", f\"_2\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>images</th>\n",
       "      <th>question</th>\n",
       "      <th>ground_truth</th>\n",
       "      <th>criteria</th>\n",
       "      <th>subtask</th>\n",
       "      <th>website</th>\n",
       "      <th>response_1</th>\n",
       "      <th>score_1</th>\n",
       "      <th>reason_1</th>\n",
       "      <th>response_2</th>\n",
       "      <th>score_2</th>\n",
       "      <th>reason_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>41</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>Identify each vulnerability's CVE identifier a...</td>\n",
       "      <td>1. CVE Identifier: CVE-2024-29824\\nDescription...</td>\n",
       "      <td>Total score: 10 points.\\n\\nScoring:\\n- For eac...</td>\n",
       "      <td>Concrete Recognition</td>\n",
       "      <td>technology</td>\n",
       "      <td>Here are the vulnerabilities along with their ...</td>\n",
       "      <td>10.00</td>\n",
       "      <td>The assistant response correctly identified al...</td>\n",
       "      <td>1. **CVE-2024-29824**\\n   - **Description**: I...</td>\n",
       "      <td>10.00</td>\n",
       "      <td>The assistant response correctly identifies ea...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>274</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>List all the news headlines in the order they ...</td>\n",
       "      <td>1. 'I'm amazed I lived to tell the tale': Awar...</td>\n",
       "      <td>0-1 correct: 0 points, 2-4 correct: 4 points, ...</td>\n",
       "      <td>Concrete Recognition</td>\n",
       "      <td>artandculture</td>\n",
       "      <td>1. **‘I’m amazed I lived to tell the tale’: Aw...</td>\n",
       "      <td>10.00</td>\n",
       "      <td>The assistant response correctly matches all t...</td>\n",
       "      <td></td>\n",
       "      <td>0.00</td>\n",
       "      <td>No response</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>215</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>List all the news headlines related to the 202...</td>\n",
       "      <td>Here are the news headlines related to the 202...</td>\n",
       "      <td>The total score is 10 points. Listing each hea...</td>\n",
       "      <td>Concrete Recognition</td>\n",
       "      <td>sports</td>\n",
       "      <td>Here are the news headlines related to the 202...</td>\n",
       "      <td>4.29</td>\n",
       "      <td>The assistant response correctly identified th...</td>\n",
       "      <td>1. \"Who will be the No. 1 pick in the 2025 NBA...</td>\n",
       "      <td>8.58</td>\n",
       "      <td>The assistant response correctly identified 6 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>215</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>List all the news headlines related to the 202...</td>\n",
       "      <td>Here are the news headlines related to the 202...</td>\n",
       "      <td>The total score is 10 points. Listing each hea...</td>\n",
       "      <td>Concrete Recognition</td>\n",
       "      <td>sports</td>\n",
       "      <td>Here are the news headlines related to the 202...</td>\n",
       "      <td>4.29</td>\n",
       "      <td>The assistant response correctly identified th...</td>\n",
       "      <td>```markdown\\n# Interview with Abdoulaye Trésor...</td>\n",
       "      <td>10.00</td>\n",
       "      <td>The assistant's response accurately captures t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>18</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>Identify and list the three formats provided b...</td>\n",
       "      <td>The three formats provided by the new annotati...</td>\n",
       "      <td>To score the response:\\n\\n- Award 10 points if...</td>\n",
       "      <td>Concrete Recognition</td>\n",
       "      <td>technology</td>\n",
       "      <td>The new `annotationlib` module in Python 3.14 ...</td>\n",
       "      <td>10.00</td>\n",
       "      <td>The assistant correctly identified and describ...</td>\n",
       "      <td>The three formats provided by the new annotati...</td>\n",
       "      <td>10.00</td>\n",
       "      <td>The assistant correctly identified and describ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    id                                             images  \\\n",
       "0   41  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "1  274  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "2  215  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "3  215  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "4   18  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "\n",
       "                                            question  \\\n",
       "0  Identify each vulnerability's CVE identifier a...   \n",
       "1  List all the news headlines in the order they ...   \n",
       "2  List all the news headlines related to the 202...   \n",
       "3  List all the news headlines related to the 202...   \n",
       "4  Identify and list the three formats provided b...   \n",
       "\n",
       "                                        ground_truth  \\\n",
       "0  1. CVE Identifier: CVE-2024-29824\\nDescription...   \n",
       "1  1. 'I'm amazed I lived to tell the tale': Awar...   \n",
       "2  Here are the news headlines related to the 202...   \n",
       "3  Here are the news headlines related to the 202...   \n",
       "4  The three formats provided by the new annotati...   \n",
       "\n",
       "                                            criteria               subtask  \\\n",
       "0  Total score: 10 points.\\n\\nScoring:\\n- For eac...  Concrete Recognition   \n",
       "1  0-1 correct: 0 points, 2-4 correct: 4 points, ...  Concrete Recognition   \n",
       "2  The total score is 10 points. Listing each hea...  Concrete Recognition   \n",
       "3  The total score is 10 points. Listing each hea...  Concrete Recognition   \n",
       "4  To score the response:\\n\\n- Award 10 points if...  Concrete Recognition   \n",
       "\n",
       "         website                                         response_1  score_1  \\\n",
       "0     technology  Here are the vulnerabilities along with their ...    10.00   \n",
       "1  artandculture  1. **‘I’m amazed I lived to tell the tale’: Aw...    10.00   \n",
       "2         sports  Here are the news headlines related to the 202...     4.29   \n",
       "3         sports  Here are the news headlines related to the 202...     4.29   \n",
       "4     technology  The new `annotationlib` module in Python 3.14 ...    10.00   \n",
       "\n",
       "                                            reason_1  \\\n",
       "0  The assistant response correctly identified al...   \n",
       "1  The assistant response correctly matches all t...   \n",
       "2  The assistant response correctly identified th...   \n",
       "3  The assistant response correctly identified th...   \n",
       "4  The assistant correctly identified and describ...   \n",
       "\n",
       "                                          response_2  score_2  \\\n",
       "0  1. **CVE-2024-29824**\\n   - **Description**: I...    10.00   \n",
       "1                                                        0.00   \n",
       "2  1. \"Who will be the No. 1 pick in the 2025 NBA...     8.58   \n",
       "3  ```markdown\\n# Interview with Abdoulaye Trésor...    10.00   \n",
       "4  The three formats provided by the new annotati...    10.00   \n",
       "\n",
       "                                            reason_2  \n",
       "0  The assistant response correctly identifies ea...  \n",
       "1                                        No response  \n",
       "2  The assistant response correctly identified 6 ...  \n",
       "3  The assistant's response accurately captures t...  \n",
       "4  The assistant correctly identified and describ...  "
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "delta = merged_data[\"score_1\"] - merged_data[\"score_2\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_data[\"delta\"] = delta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_data = merged_data.sort_values(\"delta\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask',\n",
       "       'website', 'response_1', 'score_1', 'reason_1', 'response_2', 'score_2',\n",
       "       'reason_2', 'delta'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = data[model_1].features.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "for feature in [\"score\", \"reason\", \"response\"]:\n",
    "    feat = features.pop(feature)\n",
    "    features[f\"{feature}_1\"] = feat\n",
    "    features[f\"{feature}_2\"] = feat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "features[\"delta\"] = datasets.Value(\"float32\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': Value(dtype='int32', id=None),\n",
       " 'images': Sequence(feature=Image(mode=None, decode=True, id=None), length=-1, id=None),\n",
       " 'question': Value(dtype='string', id=None),\n",
       " 'ground_truth': Value(dtype='string', id=None),\n",
       " 'criteria': Value(dtype='string', id=None),\n",
       " 'subtask': Value(dtype='string', id=None),\n",
       " 'website': Value(dtype='string', id=None),\n",
       " 'score_1': Value(dtype='float32', id=None),\n",
       " 'score_2': Value(dtype='float32', id=None),\n",
       " 'reason_1': Value(dtype='string', id=None),\n",
       " 'reason_2': Value(dtype='string', id=None),\n",
       " 'response_1': Value(dtype='string', id=None),\n",
       " 'response_2': Value(dtype='string', id=None),\n",
       " 'delta': Value(dtype='float32', id=None)}"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 218 examples [00:00, 329.76 examples/s]\n"
     ]
    }
   ],
   "source": [
    "def gen():\n",
    "    for i, row in sorted_data.iterrows():\n",
    "        yield row.to_dict()\n",
    "\n",
    "\n",
    "final_data = datasets.Dataset.from_generator(gen, features=features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 218/218 [00:00<00:00, 497.97 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  7.03ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:14<00:00, 14.44s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchDetailedResultsComparison/commit/f0941a0afeb32f8f0aae1861e6ef658a237bb249', commit_message='Upload dataset', commit_description='', oid='f0941a0afeb32f8f0aae1861e6ef658a237bb249', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lmms-lab/LiveBenchDetailedResultsComparison', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lmms-lab/LiveBenchDetailedResultsComparison'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_data.push_to_hub(\"lmms-lab/LiveBenchDetailedResultsComparison\", \"2024-09\", split=f\"{model_1}_vs_{model_2}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "live_bench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}