plotstables/filtering.ipynb (800 lines of code) (raw):

{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KcKgxgI2V3j8", "outputId": "2ab0792a-26d7-4844-9a3d-56db43c0d2e4" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "fatal: destination path 'lm1-2b8-55b-c4-perplexity' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-c4-repetitions' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-c4seeds' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-c4-perplexity' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-c4-repetitions' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-c4seeds' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-oscar-perplexity' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-oscar-repetitions' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-oscarseeds' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-oscar-perplexity' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-oscar-repetitions' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-oscarseeds' already exists and is not an empty directory.\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n", "Missing bAbI: 
lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which 
is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n" ] } ], "source": [ "import glob\n", "import os\n", "import pandas as pd\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4-perplexity\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4-repetitions\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4seeds\n", "\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone 
https://huggingface.co/datablations/lm1-4b2-84b-c4-perplexity\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4-repetitions\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4seeds\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscar-perplexity\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscar-repetitions\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscarseeds\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscar-perplexity\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscar-repetitions\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscarseeds\n", "\n", "TASK_TO_BASELINE = {\n", " \"anli_r1\": 1/3,\n", " \"anli_r2\": 1/3,\n", " \"anli_r3\": 1/3,\n", " \"arc_challenge\": 1/4,\n", " \"arc_easy\": 1/4,\n", " \"boolq\": 1/2,\n", " \"cb\": 1/3,\n", " \"copa\": 1/2,\n", " \"hellaswag\": 1/4,\n", " \"piqa\": 1/2,\n", " \"rte\": 1/2,\n", " \"sciq\": 1/4,\n", " \"storycloze_2016\": 1/4,\n", " \"winogrande\": 1/2,\n", " \"babi\": 0,\n", "}\n", "\n", "MODELS = [\n", " \"lm1-2b8-55b-c4-repetitions/2b855b55bc4\",\n", " \"lm1-2b8-55b-c4seeds/2b855b55bc4seed1\",\n", " \"lm1-2b8-55b-c4seeds/2b855b55bc4seed2\",\n", " \"lm1-2b8-55b-c4seeds/2b855b55bc4seed3\",\n", " \"lm1-2b8-55b-c4seeds/2b855b55bc4seed4\",\n", "\n", " \"lm1-2b8-55b-c4-perplexity/perplexity25\",\n", " \"lm1-2b8-55b-c4-perplexity/perplexity50\",\n", "\n", "\n", " \"lm1-4b2-84b-c4-repetitions/4b284b84bc4\",\n", " \"lm1-4b2-84b-c4seeds/4b284b84bc4seed1\",\n", " \"lm1-4b2-84b-c4seeds/4b284b84bc4seed2\",\n", " \"lm1-4b2-84b-c4seeds/4b284b84bc4seed3\",\n", " \"lm1-4b2-84b-c4seeds/4b284b84bc4seed4\",\n", "\n", " \"lm1-4b2-84b-c4-perplexity/perplexity25\",\n", " 
\"lm1-4b2-84b-c4-perplexity/perplexity50\",\n", " \"lm1-4b2-84b-c4-perplexity/perplexity2575\",\n", "\n", "\n", " \"lm1-2b8-55b-oscar-repetitions/2b855b55boscar\",\n", " \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed1\",\n", " \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed2\",\n", " \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed3\",\n", " \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed4\",\n", " \"lm1-2b8-55b-oscar-perplexity/perplexity25\",\n", "\n", " \"lm1-4b2-84b-oscar-repetitions/4b284b84boscar\",\n", " \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed1\",\n", " \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed2\",\n", " \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed3\",\n", " \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed4\",\n", "\n", " \"lm1-4b2-84b-oscar-perplexity/4b284boscarperplexity25\",\n", "]\n", "\n", "\n", "MODEL_TO_FEWSHOT_SCORES = {}\n", "MODEL_TO_FEWSHOT_SCORES_ACC = {}\n", "MODEL_TO_FEWSHOT_SCORES_GEN = {}\n", "\n", "SHOTS = list(range(6))\n", "\n", "for MODEL in MODELS:\n", " MODEL_TO_FEWSHOT_SCORES.setdefault(MODEL, {})\n", " MODEL_TO_FEWSHOT_SCORES_ACC.setdefault(MODEL, {})\n", " MODEL_TO_FEWSHOT_SCORES_GEN.setdefault(MODEL, {})\n", "\n", " path = f\"{MODEL}/evaluation/generation/merged.csv\"\n", " if not os.path.exists(path):\n", " print(\"Skipping: \", {path})\n", " continue\n", " generation = pd.read_csv(path)\n", " for SHOT in SHOTS: \n", " \n", " MODEL_TO_FEWSHOT_SCORES[MODEL].setdefault(SHOT, {})\n", " MODEL_TO_FEWSHOT_SCORES_ACC[MODEL].setdefault(SHOT, {})\n", " MODEL_TO_FEWSHOT_SCORES_GEN[MODEL].setdefault(SHOT, {})\n", "\n", " rankeval_files = glob.glob(f\"{MODEL}/evaluation/rankeval/*_{SHOT}.csv\")\n", " assert len(rankeval_files) == 1, f\"{rankeval_files}\"\n", " rankeval = pd.read_csv(rankeval_files[0])\n", "\n", " # Rescale to 0 - 1, where 0 is random performance\n", " rankeval[\"normalized\"] = rankeval.apply(lambda x: (x[\"value\"] - TASK_TO_BASELINE[x[\"task\"]]) / (1 - TASK_TO_BASELINE[x[\"task\"]]), axis=1)\n", " rankeval = rankeval[rankeval[\"metric\"] 
== \"acc\"]\n", " rankeval_scores = rankeval.normalized.values.tolist()\n", "\n", " gen_sub = generation[generation[\"fewshots\"] == SHOT]\n", " gen_sub = gen_sub[gen_sub[\"prompt\"] != \"median\"]\n", " gen_sub = gen_sub[gen_sub[\"prompt\"] != \"average\"]\n", "\n", " # 0 is already random performance, i.e. no rescaling necessary\n", " generation_scores = gen_sub.value.values.tolist()\n", " scores = rankeval_scores + generation_scores\n", " average_score = sum(scores) / len(scores)\n", "\n", " for i, row in rankeval.iterrows():\n", " MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][row[\"task\"]] = row[\"normalized\"]\n", " for i, row in gen_sub.iterrows():\n", " MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][row[\"dataset\"]] = row[\"value\"]\n", "\n", " babi_files = glob.glob(f\"{MODEL}/evaluation/*{SHOT}_babi.json\")\n", " if len(babi_files) == 1:\n", " import json\n", " with open(babi_files[0], \"r\") as f:\n", " MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][\"babi\"] = json.load(f)[\"results\"][\"babi\"][\"em\"]\n", " else:\n", " print(f\"Missing bAbI: {MODEL}; Setting to 0 (which is OK for C4)\")\n", " MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][\"babi\"] = 0\n", "\n", " #MODEL_TO_FEWSHOT_SCORES[BASE_MODEL][MODEL][SHOT]\n", " #MODEL_TO_FEWSHOT_SCORES_ACC[BASE_MODEL][MODEL].append(sum(rankeval_scores) / len(rankeval_scores))\n", " #MODEL_TO_FEWSHOT_SCORES_GEN[BASE_MODEL][MODEL].append(sum(generation_scores) / len(generation_scores))\n" ] }, { "cell_type": "code", "source": [ "### Latex Table ###\n", "\n", "import numpy as np\n", "\n", "TASK_TO_NAME = {\n", " \"anli_r1\": \"ANLI R1\",\n", " \"anli_r2\": \"ANLI R2\",\n", " \"anli_r3\": \"ANLI R3\",\n", " \"arc_challenge\": \"ARC-Challenge\",\n", " \"arc_easy\": \"ARC-Easy\",\n", " \"boolq\": \"BoolQ\",\n", " \"cb\": \"CB\",\n", " \"copa\": \"COPA\",\n", " \"hellaswag\": \"HellaSwag\",\n", " \"piqa\": \"PiQA\",\n", " \"rte\": \"RTE\",\n", " \"sciq\": \"SciQ\",\n", " \"storycloze_2016\": \"StoryCloze 2016\",\n", " \"winogrande\": \"WinoGrande 
XL\",\n", "\n", " \"e2e_nlg_cleaned\": \"E2E NLG\",\n", " \"gem_xsum\": \"XSUM\",\n", " \"web_nlg_en\": \"WebNLG EN\",\n", " \"wiki_lingua_en\": \"WikiLingua EN\",\n", "\n", " \"babi\": \"bAbI\", \n", "}\n", "\n", "### Table 1: OSCAR + C4 2b8\n", "average = True\n", "SHOT = 5\n", "\n", "TABLE = \"\"\"\n", "\\\\begin{table}\n", " \\\\centering\n", " \\\\caption{Perplexity}\n", " \\\\resizebox{\\\\textwidth}{!}{\n", " \\\\begin{tabular}{l|lllllll|llll}\n", "\"\"\"\n", "\n", "HEADER_A = \"Training Data & \\multicolumn{7}{c|}{C4} & \\multicolumn{4}{c}{OSCAR}\" + \" \\\\\\\\\"\n", "HEADER_B = \"Parameters & \\multicolumn{3}{c|}{2.8B parameters} & \\multicolumn{4}{c|}{4.2B parameters} & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters}\" + \" \\\\\\\\\"\n", "#HEADER_C = \"Dataset & No filtering & Top 25\\\\% perplexity & Top 50\\\\% Perplexity & Top 25\\\\% perplexity & Top 50\\\\% Perplexity & 25\\\\% - 75\\\\% perplexity & No filtering & Top 25\\\\% perplexity & No filtering & Top 25\\\\% perplexity\" + \" \\\\\\\\\"\n", "HEADER_C = \"Dataset & All & Top 25\\\\% & Top 50\\\\% & All & 25\\\\% & 50\\\\% & 25\\\\% - 75\\\\% & All & Top 25\\\\% & All & Top 25\\\\% perplexity\" + \" \\\\\\\\\"\n", "\n", "TABLE += \"\\n\\\\midrule\\n\".join([HEADER_A, HEADER_B, HEADER_C])\n", "\n", "for task in TASK_TO_NAME:\n", " scores_str = []\n", " for model in MODELS:\n", " if \"seed\" in model: continue\n", " if \"perplexity\" not in model:\n", " model_scores = []\n", " modelseeds = model.replace(\"-repetitions\", \"seeds\")\n", " for model_name in [model+\"\", modelseeds+\"seed1\", modelseeds+\"seed2\", modelseeds+\"seed3\", modelseeds+\"seed4\"]:\n", " model_scores.append(sum([MODEL_TO_FEWSHOT_SCORES[model_name][SHOT][task] for SHOT in [0, 1, 2, 3, 4, 5]]) / 6)\n", " avg = np.mean(model_scores)\n", " std = np.std(model_scores)\n", " scores_str.append(f\"{round(avg * 100, 1)} ± {str(round(std * 100, 1))}\")\n", " \n", " else:\n", " avg = 
sum([MODEL_TO_FEWSHOT_SCORES[model][SHOT][task] for SHOT in [0, 1, 2, 3, 4, 5]]) / 6\n", " scores_str.append(str(round(avg * 100, 1)))\n", " \n", " TABLE += \"\\n\" + f\"{TASK_TO_NAME[task]} & \" + \" & \".join(scores_str) + \" \\\\\\\\\"\n", "\n", " if task in (\"winogrande\", \"wiki_lingua_en\", \"babi\"):\n", " TABLE += \"\\n\" + \"\\\\midrule\"\n", "\n", "\n", "# Add average\n", "scores_avg_str = []\n", "for model in MODELS:\n", " if \"seed\" in model: continue\n", " if \"perplexity\" not in model:\n", " model_scores = []\n", " modelseeds = model.replace(\"-repetitions\", \"seeds\")\n", " for model_name in [model+\"\", modelseeds+\"seed1\", modelseeds+\"seed2\", modelseeds+\"seed3\", modelseeds+\"seed4\"]:\n", " scores = [list(MODEL_TO_FEWSHOT_SCORES[model_name][SHOT].values()) for SHOT in [0, 1, 2, 3, 4, 5]]\n", " for task_scores in scores:\n", " assert len(task_scores) == len(TASK_TO_NAME) \n", " model_scores.append(\n", " np.mean([sub_score for shot_scores in scores for sub_score in shot_scores])\n", " )\n", " avg = np.mean(model_scores)\n", " std = np.std(model_scores)\n", " scores_avg_str.append(f\"{round(avg * 100, 1)} ± {str(round(std * 100, 1))}\")\n", "\n", " else:\n", " scores = [list(MODEL_TO_FEWSHOT_SCORES[model][SHOT].values()) for SHOT in [0, 1, 2, 3, 4, 5]]\n", " for task_scores in scores:\n", " assert len(task_scores) == len(TASK_TO_NAME)\n", " score_avg = [sub_score for shot_scores in scores for sub_score in shot_scores]\n", " avg = sum(score_avg) / len(score_avg)\n", " scores_avg_str.append(f\"{round(avg * 100, 1)}\")\n", "\n", "#std = np.std(scores_avg[:5])\n", "#scores_avg_std = str(round(((sum(scores_avg[:5]) / 5) * 100), 1)) + f\" ± {str(round(std * 100, 1))}\"\n", "\n", "#std_oscar = np.std(scores_avg[-6:-1])\n", "#scores_avg_std_oscar = str(round(((sum(scores_avg[-6:-1]) / 5) * 100), 1)) + f\" ± {str(round(std_oscar * 100, 1))}\"\n", "\n", "#scores_avg_str_c4 = scores_avg_std + \" & \" + \" & \".join([str(round(score_avg * 100, 1)) for 
score_avg in scores_avg[5:-6]])\n", "#scores_avg_str_oscar = scores_avg_std_oscar + \" & \" + \" & \".join([str(round(score_avg * 100, 1)) for score_avg in scores_avg[-1:]])\n", "\n", "\n", "#TABLE += \"\\nAverage & \" + scores_avg_str_c4 + \" & \" + scores_avg_str_oscar + \" \\\\\\\\\"\n", "\n", "TABLE += \"\\nAverage & \" + \" & \".join(scores_avg_str) + \" \\\\\\\\\"\n", "TABLE += \"\\n\\\\bottomrule\"\n", "\n", "TABLE += \"\"\"\n", " \\end{tabular}\n", " }\n", " \\label{tab:pplx}\n", "\\end{table}\n", "\"\"\"\n", "\n", "print(TABLE)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Rp7ILYgsXZyH", "outputId": "e8f72ce8-389c-4226-9c44-55de50c852b7" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "\\begin{table}\n", " \\centering\n", " \\caption{Perplexity}\n", " \\resizebox{\\textwidth}{!}{\n", " \\begin{tabular}{l|lllllll|llll}\n", "Training Data & \\multicolumn{7}{c|}{C4} & \\multicolumn{4}{c}{OSCAR} \\\\\n", "\\midrule\n", "Parameters & \\multicolumn{3}{c|}{2.8B parameters} & \\multicolumn{4}{c|}{4.2B parameters} & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters} \\\\\n", "\\midrule\n", "Dataset & All & Top 25\\% & Top 50\\% & All & 25\\% & 50\\% & 25\\% - 75\\% & All & Top 25\\% & All & Top 25\\% perplexity \\\\\n", "ANLI R1 & 0.4 ± 1.6 & -0.1 & 0.9 & -0.5 ± 1.4 & -0.0 & -0.7 & -0.8 & -0.3 ± 0.5 & -0.4 & -0.4 ± 1.2 & -2.2 \\\\\n", "ANLI R2 & 0.9 ± 0.4 & -0.2 & -0.7 & 0.0 ± 1.3 & -0.4 & -0.0 & 1.1 & 1.0 ± 1.0 & 1.7 & 1.0 ± 0.9 & 0.7 \\\\\n", "ANLI R3 & 1.7 ± 0.5 & 0.5 & 1.4 & 0.7 ± 0.5 & 0.7 & 2.9 & 0.4 & 0.4 ± 0.8 & 1.7 & 1.2 ± 0.5 & 2.1 \\\\\n", "ARC-Challenge & 1.6 ± 1.0 & 3.3 & 2.9 & 4.2 ± 1.6 & 10.2 & 9.3 & 7.9 & -1.4 ± 0.8 & 3.3 & 1.8 ± 0.8 & 6.3 \\\\\n", "ARC-Easy & 44.5 ± 0.5 & 47.3 & 47.7 & 48.1 ± 4.8 & 55.8 & 53.7 & 51.0 & 39.7 ± 0.3 & 46.8 & 45.7 ± 0.6 & 51.8 \\\\\n", "BoolQ & 18.8 ± 3.4 & 17.1 & 17.7 & 22.4 ± 3.3 & 27.7 & 
23.5 & 24.5 & 12.8 ± 4.4 & 11.8 & 12.4 ± 5.9 & 22.2 \\\\\n", "CB & 20.0 ± 4.7 & 16.1 & 13.8 & 9.3 ± 16.6 & 24.6 & 22.3 & 12.5 & 19.7 ± 5.1 & 17.0 & 23.9 ± 3.8 & 20.1 \\\\\n", "COPA & 49.7 ± 3.5 & 55.7 & 56.0 & 55.3 ± 3.8 & 60.7 & 66.0 & 61.0 & 42.7 ± 2.2 & 44.0 & 41.1 ± 3.0 & 49.3 \\\\\n", "HellaSwag & 24.7 ± 0.3 & 24.7 & 26.0 & 29.4 ± 1.3 & 30.7 & 32.7 & 33.1 & 16.3 ± 0.1 & 19.0 & 21.0 ± 0.2 & 23.3 \\\\\n", "PiQA & 47.9 ± 0.6 & 43.4 & 45.8 & 48.8 ± 3.8 & 47.9 & 52.2 & 52.1 & 41.2 ± 0.7 & 38.3 & 45.0 ± 0.6 & 44.4 \\\\\n", "RTE & 5.1 ± 4.0 & 5.7 & 7.3 & 6.9 ± 3.1 & 11.9 & 2.2 & 10.3 & 3.9 ± 1.1 & -1.2 & 2.2 ± 4.3 & 7.0 \\\\\n", "SciQ & 83.2 ± 0.6 & 82.4 & 82.8 & 86.3 ± 1.1 & 88.6 & 87.4 & 88.4 & 83.2 ± 0.6 & 84.0 & 86.3 ± 0.6 & 86.5 \\\\\n", "StoryCloze 2016 & 58.7 ± 0.2 & 61.1 & 61.2 & 62.8 ± 0.5 & 65.5 & 65.6 & 65.1 & 52.8 ± 0.3 & 57.9 & 57.2 ± 0.6 & 60.2 \\\\\n", "WinoGrande XL & 11.6 ± 0.8 & 15.3 & 14.3 & 18.7 ± 1.0 & 24.9 & 22.3 & 18.7 & 5.8 ± 0.9 & 9.7 & 10.1 ± 1.0 & 14.8 \\\\\n", "\\midrule\n", "E2E NLG & 17.0 ± 1.4 & 16.1 & 16.8 & 17.9 ± 0.7 & 18.8 & 17.8 & 19.2 & 20.3 ± 0.3 & 19.5 & 21.6 ± 0.7 & 22.6 \\\\\n", "XSUM & 2.4 ± 0.1 & 2.6 & 3.0 & 3.0 ± 0.3 & 3.9 & 3.2 & 3.0 & 3.0 ± 0.1 & 3.2 & 3.7 ± 0.2 & 2.7 \\\\\n", "WebNLG EN & 5.3 ± 0.1 & 4.8 & 5.1 & 5.6 ± 0.3 & 5.4 & 5.7 & 5.2 & 8.8 ± 0.4 & 6.9 & 9.3 ± 0.5 & 10.6 \\\\\n", "WikiLingua EN & 3.0 ± 0.1 & 3.2 & 3.3 & 3.6 ± 0.2 & 3.4 & 3.5 & 3.4 & 2.9 ± 0.1 & 3.4 & 4.0 ± 0.1 & 3.8 \\\\\n", "\\midrule\n", "bAbI & 0.0 ± 0.0 & 0.0 & 0.0 & 0.0 ± 0.0 & 0.0 & 0.0 & 0.0 & 15.5 ± 1.0 & 14.5 & 19.3 ± 1.0 & 17.2 \\\\\n", "\\midrule\n", "Average & 20.9 ± 0.4 & 21.0 & 21.3 & 22.2 ± 1.4 & 25.3 & 24.7 & 24.0 & 19.4 ± 0.5 & 20.1 & 21.4 ± 0.5 & 23.3 \\\\\n", "\\bottomrule\n", "\bottomrule\n", " \\end{tabular}\n", " }\n", " \\label{tab:pplx}\n", "\\end{table}\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "import glob\n", "import os\n", "import pandas as pd\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone 
https://huggingface.co/datablations/lm1-2b8-55b-dedup\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4-repetitions\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4seeds\n", "\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4-dedup\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4-repetitions\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4seeds\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscardedup25\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscar-repetitions\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscarseeds\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscar-dedup\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscar-repetitions\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscarseeds\n", "\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscarroots\n", "!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscarroots\n", "\n", "\n", "TASK_TO_BASELINE = {\n", " \"anli_r1\": 1/3,\n", " \"anli_r2\": 1/3,\n", " \"anli_r3\": 1/3,\n", " \"arc_challenge\": 1/4,\n", " \"arc_easy\": 1/4,\n", " \"boolq\": 1/2,\n", " \"cb\": 1/3,\n", " \"copa\": 1/2,\n", " \"hellaswag\": 1/4,\n", " \"piqa\": 1/2,\n", " \"rte\": 1/2,\n", " \"sciq\": 1/4,\n", " \"storycloze_2016\": 1/4,\n", " \"winogrande\": 1/2,\n", " \"babi\": 0,\n", "}\n", "\n", "MODELS = [\n", " \"lm1-2b8-55b-c4-repetitions/2b855b55bc4\",\n", " \"lm1-2b8-55b-c4seeds/2b855b55bc4seed1\",\n", " \"lm1-2b8-55b-c4seeds/2b855b55bc4seed2\",\n", " \"lm1-2b8-55b-c4seeds/2b855b55bc4seed3\",\n", " 
\"lm1-2b8-55b-c4seeds/2b855b55bc4seed4\",\n", " \"lm1-2b8-55b-dedup\",\n", "\n", " \"lm1-4b2-84b-c4-repetitions/4b284b84bc4\",\n", " \"lm1-4b2-84b-c4seeds/4b284b84bc4seed1\",\n", " \"lm1-4b2-84b-c4seeds/4b284b84bc4seed2\",\n", " \"lm1-4b2-84b-c4seeds/4b284b84bc4seed3\",\n", " \"lm1-4b2-84b-c4seeds/4b284b84bc4seed4\",\n", " \"lm1-4b2-84b-c4-dedup/4b284bc4dedup\",\n", "\n", " \"lm1-2b8-55b-oscar-repetitions/2b855b55boscar\",\n", " \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed1\",\n", " \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed2\",\n", " \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed3\",\n", " \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed4\",\n", " \"lm1-2b8-55b-oscardedup25/\",\n", " \"lm1-2b8-55b-oscarroots\",\n", "\n", " \"lm1-4b2-84b-oscar-repetitions/4b284b84boscar\",\n", " \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed1\",\n", " \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed2\",\n", " \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed3\",\n", " \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed4\",\n", " \"lm1-4b2-84b-oscar-dedup/4b284b84boscardedup25expanded\",\n", " \"lm1-4b2-84b-oscarroots\",\n", "]\n", "\n", "\n", "MODEL_TO_FEWSHOT_SCORES = {}\n", "MODEL_TO_FEWSHOT_SCORES_ACC = {}\n", "MODEL_TO_FEWSHOT_SCORES_GEN = {}\n", "\n", "SHOTS = list(range(6))\n", "\n", "for MODEL in MODELS:\n", " MODEL_TO_FEWSHOT_SCORES.setdefault(MODEL, {})\n", " MODEL_TO_FEWSHOT_SCORES_ACC.setdefault(MODEL, {})\n", " MODEL_TO_FEWSHOT_SCORES_GEN.setdefault(MODEL, {})\n", "\n", " path = f\"{MODEL}/evaluation/generation/merged.csv\"\n", " if not os.path.exists(path):\n", " print(\"Skipping: \", {path})\n", " continue\n", " generation = pd.read_csv(path)\n", " for SHOT in SHOTS: \n", " \n", " MODEL_TO_FEWSHOT_SCORES[MODEL].setdefault(SHOT, {})\n", " MODEL_TO_FEWSHOT_SCORES_ACC[MODEL].setdefault(SHOT, {})\n", " MODEL_TO_FEWSHOT_SCORES_GEN[MODEL].setdefault(SHOT, {})\n", "\n", " rankeval_files = glob.glob(f\"{MODEL}/evaluation/rankeval/*_{SHOT}.csv\")\n", " assert len(rankeval_files) == 1, 
f\"{rankeval_files}\"\n", " rankeval = pd.read_csv(rankeval_files[0])\n", "\n", " # Rescale to 0 - 1, where 0 is random performance\n", " rankeval[\"normalized\"] = rankeval.apply(lambda x: (x[\"value\"] - TASK_TO_BASELINE[x[\"task\"]]) / (1 - TASK_TO_BASELINE[x[\"task\"]]), axis=1)\n", " rankeval = rankeval[rankeval[\"metric\"] == \"acc\"]\n", " rankeval_scores = rankeval.normalized.values.tolist()\n", "\n", " gen_sub = generation[generation[\"fewshots\"] == SHOT]\n", " gen_sub = gen_sub[gen_sub[\"prompt\"] != \"median\"]\n", " gen_sub = gen_sub[gen_sub[\"prompt\"] != \"average\"]\n", "\n", " # 0 is already random performance, i.e. no rescaling necessary\n", " generation_scores = gen_sub.value.values.tolist()\n", " scores = rankeval_scores + generation_scores\n", " average_score = sum(scores) / len(scores)\n", "\n", " for i, row in rankeval.iterrows():\n", " MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][row[\"task\"]] = row[\"normalized\"]\n", " for i, row in gen_sub.iterrows():\n", " MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][row[\"dataset\"]] = row[\"value\"]\n", "\n", " babi_files = glob.glob(f\"{MODEL}/evaluation/*{SHOT}_babi.json\")\n", " if len(babi_files) == 1:\n", " import json\n", " with open(babi_files[0], \"r\") as f:\n", " MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][\"babi\"] = json.load(f)[\"results\"][\"babi\"][\"em\"]\n", " else:\n", " print(f\"Missing bAbI: {MODEL}; Setting to 0 (which is OK for C4)\")\n", " MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][\"babi\"] = 0\n", "\n", " #MODEL_TO_FEWSHOT_SCORES[BASE_MODEL][MODEL][SHOT]\n", " #MODEL_TO_FEWSHOT_SCORES_ACC[BASE_MODEL][MODEL].append(sum(rankeval_scores) / len(rankeval_scores))\n", " #MODEL_TO_FEWSHOT_SCORES_GEN[BASE_MODEL][MODEL].append(sum(generation_scores) / len(generation_scores))\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nKLZFSCWXnHN", "outputId": "4f6f4dbc-db43-4304-a6aa-69887bd01bdc" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ 
"fatal: destination path 'lm1-2b8-55b-dedup' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-c4-repetitions' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-c4seeds' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-c4-dedup' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-c4-repetitions' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-c4seeds' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-oscardedup25' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-oscar-repetitions' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-oscarseeds' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-oscar-dedup' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-oscar-repetitions' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-oscarseeds' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-2b8-55b-oscarroots' already exists and is not an empty directory.\n", "fatal: destination path 'lm1-4b2-84b-oscarroots' already exists and is not an empty directory.\n", "Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting 
to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: 
lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n", "Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n" ] } ] }, { "cell_type": "code", "source": [ "### Latex Table ###\n", "\n", "import numpy as np\n", "\n", "TASK_TO_NAME = {\n", " \"anli_r1\": \"ANLI R1\",\n", " \"anli_r2\": \"ANLI R2\",\n", " \"anli_r3\": \"ANLI R3\",\n", " \"arc_challenge\": \"ARC-Challenge\",\n", " \"arc_easy\": \"ARC-Easy\",\n", " \"boolq\": \"BoolQ\",\n", " \"cb\": \"CB\",\n", " \"copa\": \"COPA\",\n", " \"hellaswag\": \"HellaSwag\",\n", " \"piqa\": \"PiQA\",\n", " \"rte\": \"RTE\",\n", " \"sciq\": \"SciQ\",\n", " \"storycloze_2016\": \"StoryCloze 2016\",\n", " \"winogrande\": \"WinoGrande XL\",\n", "\n", " \"e2e_nlg_cleaned\": \"E2E NLG\",\n", " \"gem_xsum\": \"XSUM\",\n", " \"web_nlg_en\": \"WebNLG EN\",\n", " \"wiki_lingua_en\": \"WikiLingua EN\",\n", "\n", " \"babi\": \"bAbI\", \n", "}\n", "\n", "### Table 1: OSCAR + C4 2b8\n", "average = True\n", "SHOT = 
5\n", "\n", "TABLE = \"\"\"\n", "\\\\begin{table}\n", " \\\\centering\n", " \\\\caption{\\\\textbf{Results after filtering with deduplication.}}\n", " \\\\resizebox{\\\\textwidth}{!}{\n", " \\\\begin{tabular}{l|llll|llll}\n", " \\\\toprule \n", "\"\"\"\n", "\n", "HEADER_A = \"Training Data & \\multicolumn{4}{c|}{C4} & \\multicolumn{4}{c}{OSCAR}\" + \" \\\\\\\\\"\n", "HEADER_B = \"Parameters & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters} & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters}\" + \" \\\\\\\\\"\n", "HEADER_C = \"Dataset & All & Dedup & All & Dedup & All & Dedup & All & Dedup\" + \" \\\\\\\\\"\n", "\n", "TABLE += \"\\n\\\\midrule\\n\".join([HEADER_A, HEADER_B, HEADER_C])\n", "\n", "for task in TASK_TO_NAME:\n", " scores_str = []\n", " for model in MODELS:\n", " if \"seed\" in model: continue\n", " if (\"dedup\" not in model) and (\"roots\" not in model):\n", " model_scores = []\n", " modelseeds = model.replace(\"-repetitions\", \"seeds\")\n", " for model_name in [model+\"\", modelseeds+\"seed1\", modelseeds+\"seed2\", modelseeds+\"seed3\", modelseeds+\"seed4\"]:\n", " model_scores.append(sum([MODEL_TO_FEWSHOT_SCORES[model_name][SHOT][task] for SHOT in [0, 1, 2, 3, 4, 5]]) / 6)\n", " avg = np.mean(model_scores)\n", " std = np.std(model_scores)\n", " scores_str.append(f\"{round(avg * 100, 1)} ± {str(round(std * 100, 1))}\")\n", " \n", " else:\n", " avg = sum([MODEL_TO_FEWSHOT_SCORES[model][SHOT][task] for SHOT in [0, 1, 2, 3, 4, 5]]) / 6\n", " scores_str.append(str(round(avg * 100, 1)))\n", " \n", " TABLE += \"\\n\" + f\"{TASK_TO_NAME[task]} & \" + \" & \".join(scores_str) + \" \\\\\\\\\"\n", "\n", " if task in (\"winogrande\", \"wiki_lingua_en\", \"babi\"):\n", " TABLE += \"\\n\" + \"\\\\midrule\"\n", "\n", "\n", "# Add average\n", "scores_avg_str = []\n", "for model in MODELS:\n", " if \"seed\" in model: continue\n", " if (\"dedup\" not in model) and (\"roots\" not in model):\n", " model_scores = 
[]\n", "        modelseeds = model.replace(\"-repetitions\", \"seeds\")\n", "        for model_name in [model+\"\", modelseeds+\"seed1\", modelseeds+\"seed2\", modelseeds+\"seed3\", modelseeds+\"seed4\"]:\n", "            # `shot` deliberately lowercase so it does not shadow the global SHOT constant.\n", "            scores = [list(MODEL_TO_FEWSHOT_SCORES[model_name][shot].values()) for shot in [0, 1, 2, 3, 4, 5]]\n", "            for task_scores in scores:\n", "                assert len(task_scores) == len(TASK_TO_NAME)\n", "            model_scores.append(\n", "                np.mean([sub_score for shot_scores in scores for sub_score in shot_scores])\n", "            )\n", "        avg = np.mean(model_scores)\n", "        std = np.std(model_scores)\n", "        scores_avg_str.append(f\"{round(avg * 100, 1)} ± {round(std * 100, 1)}\")\n", "\n", "    else:\n", "        scores = [list(MODEL_TO_FEWSHOT_SCORES[model][shot].values()) for shot in [0, 1, 2, 3, 4, 5]]\n", "        for task_scores in scores:\n", "            assert len(task_scores) == len(TASK_TO_NAME)\n", "        score_avg = [sub_score for shot_scores in scores for sub_score in shot_scores]\n", "        avg = sum(score_avg) / len(score_avg)\n", "        scores_avg_str.append(f\"{round(avg * 100, 1)}\")\n", "\n", "TABLE += \"\\nAverage & \" + \" & \".join(scores_avg_str) + \" \\\\\\\\\"\n", "TABLE += \"\\n\\\\bottomrule\"\n", "\n", "# Backslashes doubled so \\b is not read as a backspace escape; the redundant bottomrule line\n", "# (already appended above) was removed from this closing block.\n", "TABLE += \"\"\"\n", "    \\\\end{tabular}\n", "    }\n", "    \\\\label{tab:dedup}\n", "\\\\end{table}\n", "\"\"\"\n", "\n", "print(TABLE)" ], "metadata": { "id": "gy-vQlPyZG7c", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "85464b57-c629-495b-c8b2-8a38de221c65" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "\\begin{table}\n", "    \\centering\n", "    \\caption{\textbf{Results after filtering with deduplication.}}\n", "    \\resizebox{\\textwidth}{!}{\n", "    \\begin{tabular}{l|llll|llll}\n", "    \toprule  \n", "Training Data & \\multicolumn{4}{c|}{C4} & \\multicolumn{4}{c}{OSCAR} \\\\\n", "\\midrule\n", "Parameters & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters} & \\multicolumn{2}{c|}{2.8B parameters} & 
\\multicolumn{2}{c|}{4.2B parameters} \\\\\n", "\\midrule\n", "Dataset & All & Dedup & All & Dedup & All & Dedup & All & Dedup \\\\\n", "ANLI R1 & 0.4 ± 1.6 & -0.2 & -0.5 ± 1.4 & -0.8 & -0.3 ± 0.5 & -2.1 & -1.7 & -0.4 ± 1.2 & -1.8 & 1.2 \\\\\n", "ANLI R2 & 0.9 ± 0.4 & 1.1 & 0.0 ± 1.3 & -0.1 & 1.0 ± 1.0 & 2.0 & 0.7 & 1.0 ± 0.9 & -0.5 & -0.3 \\\\\n", "ANLI R3 & 1.7 ± 0.5 & 1.8 & 0.7 ± 0.5 & 0.4 & 0.4 ± 0.8 & 0.4 & 0.2 & 1.2 ± 0.5 & 0.8 & -0.3 \\\\\n", "ARC-Challenge & 1.6 ± 1.0 & 0.6 & 4.2 ± 1.6 & 3.9 & -1.4 ± 0.8 & 2.6 & -0.9 & 1.8 ± 0.8 & 6.8 & 0.6 \\\\\n", "ARC-Easy & 44.5 ± 0.5 & 43.0 & 48.1 ± 4.8 & 46.8 & 39.7 ± 0.3 & 44.6 & 42.3 & 45.7 ± 0.6 & 51.0 & 47.1 \\\\\n", "BoolQ & 18.8 ± 3.4 & 1.5 & 22.4 ± 3.3 & 2.2 & 12.8 ± 4.4 & 3.4 & 13.4 & 12.4 ± 5.9 & 13.0 & 7.0 \\\\\n", "CB & 20.0 ± 4.7 & 0.4 & 9.3 ± 16.6 & 0.9 & 19.7 ± 5.1 & 25.4 & 14.3 & 23.9 ± 3.8 & 25.0 & 28.1 \\\\\n", "COPA & 49.7 ± 3.5 & 57.0 & 55.3 ± 3.8 & 60.0 & 42.7 ± 2.2 & 47.3 & 37.7 & 41.1 ± 3.0 & 55.3 & 43.0 \\\\\n", "HellaSwag & 24.7 ± 0.3 & 25.1 & 29.4 ± 1.3 & 30.7 & 16.3 ± 0.1 & 22.8 & 17.6 & 21.0 ± 0.2 & 26.3 & 22.4 \\\\\n", "PiQA & 47.9 ± 0.6 & 49.1 & 48.8 ± 3.8 & 53.4 & 41.2 ± 0.7 & 45.1 & 41.9 & 45.0 ± 0.6 & 48.5 & 46.3 \\\\\n", "RTE & 5.1 ± 4.0 & 3.2 & 6.9 ± 3.1 & 0.1 & 3.9 ± 1.1 & 6.1 & 5.8 & 2.2 ± 4.3 & 1.1 & 8.9 \\\\\n", "SciQ & 83.2 ± 0.6 & 80.4 & 86.3 ± 1.1 & 82.2 & 83.2 ± 0.6 & 82.6 & 83.1 & 86.3 ± 0.6 & 88.5 & 86.4 \\\\\n", "StoryCloze 2016 & 58.7 ± 0.2 & 61.8 & 62.8 ± 0.5 & 65.2 & 52.8 ± 0.3 & 58.1 & 54.3 & 57.2 ± 0.6 & 61.6 & 58.6 \\\\\n", "WinoGrande XL & 11.6 ± 0.8 & 13.3 & 18.7 ± 1.0 & 19.7 & 5.8 ± 0.9 & 12.7 & 5.6 & 10.1 ± 1.0 & 16.2 & 11.0 \\\\\n", "\\midrule\n", "E2E NLG & 17.0 ± 1.4 & 15.6 & 17.9 ± 0.7 & 14.2 & 20.3 ± 0.3 & 20.5 & 20.5 & 21.6 ± 0.7 & 2.4 & 22.6 \\\\\n", "XSUM & 2.4 ± 0.1 & 2.1 & 3.0 ± 0.3 & 2.5 & 3.0 ± 0.1 & 3.2 & 3.1 & 3.7 ± 0.2 & 4.6 & 3.8 \\\\\n", "WebNLG EN & 5.3 ± 0.1 & 4.3 & 5.6 ± 0.3 & 4.4 & 8.8 ± 0.4 & 7.4 & 7.4 & 9.3 ± 0.5 & 9.7 & 9.4 \\\\\n", 
"WikiLingua EN & 3.0 ± 0.1 & 3.2 & 3.6 ± 0.2 & 3.2 & 2.9 ± 0.1 & 3.0 & 3.1 & 4.0 ± 0.1 & 4.3 & 4.0 \\\\\n", "\\midrule\n", "bAbI & 0.0 ± 0.0 & 0.0 & 0.0 ± 0.0 & 0.0 & 15.5 ± 1.0 & 17.2 & 14.3 & 19.3 ± 1.0 & 21.1 & 18.0 \\\\\n", "\\midrule\n", "Average & 20.9 ± 0.4 & 19.1 & 22.2 ± 1.4 & 20.5 & 19.4 ± 0.5 & 21.2 & 19.1 & 21.4 ± 0.5 & 22.8 & 22.0 \\\\\n", "\\bottomrule\n", "\bottomrule\n", " \\end{tabular}\n", " }\n", " \\label{tab:dedup}\n", "\\end{table}\n", "\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "9PizHEDmZBpO" }, "execution_count": null, "outputs": [] } ] }