plotstables/filtering.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "KcKgxgI2V3j8",
"outputId": "2ab0792a-26d7-4844-9a3d-56db43c0d2e4"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"fatal: destination path 'lm1-2b8-55b-c4-perplexity' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-c4-repetitions' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-c4seeds' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-c4-perplexity' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-c4-repetitions' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-c4seeds' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-oscar-perplexity' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-oscar-repetitions' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-oscarseeds' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-oscar-perplexity' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-oscar-repetitions' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-oscarseeds' already exists and is not an empty directory.\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity25; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-c4-perplexity/perplexity50; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n"
]
}
],
"source": [
"import glob\n",
"import os\n",
"import pandas as pd\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4-perplexity\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4-repetitions\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4seeds\n",
"\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4-perplexity\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4-repetitions\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4seeds\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscar-perplexity\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscar-repetitions\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscarseeds\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscar-perplexity\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscar-repetitions\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscarseeds\n",
"\n",
"TASK_TO_BASELINE = {\n",
" \"anli_r1\": 1/3,\n",
" \"anli_r2\": 1/3,\n",
" \"anli_r3\": 1/3,\n",
" \"arc_challenge\": 1/4,\n",
" \"arc_easy\": 1/4,\n",
" \"boolq\": 1/2,\n",
" \"cb\": 1/3,\n",
" \"copa\": 1/2,\n",
" \"hellaswag\": 1/4,\n",
" \"piqa\": 1/2,\n",
" \"rte\": 1/2,\n",
" \"sciq\": 1/4,\n",
" \"storycloze_2016\": 1/4,\n",
" \"winogrande\": 1/2,\n",
" \"babi\": 0,\n",
"}\n",
"\n",
"MODELS = [\n",
" \"lm1-2b8-55b-c4-repetitions/2b855b55bc4\",\n",
" \"lm1-2b8-55b-c4seeds/2b855b55bc4seed1\",\n",
" \"lm1-2b8-55b-c4seeds/2b855b55bc4seed2\",\n",
" \"lm1-2b8-55b-c4seeds/2b855b55bc4seed3\",\n",
" \"lm1-2b8-55b-c4seeds/2b855b55bc4seed4\",\n",
"\n",
" \"lm1-2b8-55b-c4-perplexity/perplexity25\",\n",
" \"lm1-2b8-55b-c4-perplexity/perplexity50\",\n",
"\n",
"\n",
" \"lm1-4b2-84b-c4-repetitions/4b284b84bc4\",\n",
" \"lm1-4b2-84b-c4seeds/4b284b84bc4seed1\",\n",
" \"lm1-4b2-84b-c4seeds/4b284b84bc4seed2\",\n",
" \"lm1-4b2-84b-c4seeds/4b284b84bc4seed3\",\n",
" \"lm1-4b2-84b-c4seeds/4b284b84bc4seed4\",\n",
"\n",
" \"lm1-4b2-84b-c4-perplexity/perplexity25\",\n",
" \"lm1-4b2-84b-c4-perplexity/perplexity50\",\n",
" \"lm1-4b2-84b-c4-perplexity/perplexity2575\",\n",
"\n",
"\n",
" \"lm1-2b8-55b-oscar-repetitions/2b855b55boscar\",\n",
" \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed1\",\n",
" \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed2\",\n",
" \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed3\",\n",
" \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed4\",\n",
" \"lm1-2b8-55b-oscar-perplexity/perplexity25\",\n",
"\n",
" \"lm1-4b2-84b-oscar-repetitions/4b284b84boscar\",\n",
" \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed1\",\n",
" \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed2\",\n",
" \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed3\",\n",
" \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed4\",\n",
"\n",
" \"lm1-4b2-84b-oscar-perplexity/4b284boscarperplexity25\",\n",
"]\n",
"\n",
"\n",
"MODEL_TO_FEWSHOT_SCORES = {}\n",
"MODEL_TO_FEWSHOT_SCORES_ACC = {}\n",
"MODEL_TO_FEWSHOT_SCORES_GEN = {}\n",
"\n",
"SHOTS = list(range(6))\n",
"\n",
"for MODEL in MODELS:\n",
" MODEL_TO_FEWSHOT_SCORES.setdefault(MODEL, {})\n",
" MODEL_TO_FEWSHOT_SCORES_ACC.setdefault(MODEL, {})\n",
" MODEL_TO_FEWSHOT_SCORES_GEN.setdefault(MODEL, {})\n",
"\n",
" path = f\"{MODEL}/evaluation/generation/merged.csv\"\n",
" if not os.path.exists(path):\n",
" print(\"Skipping: \", {path})\n",
" continue\n",
" generation = pd.read_csv(path)\n",
" for SHOT in SHOTS: \n",
" \n",
" MODEL_TO_FEWSHOT_SCORES[MODEL].setdefault(SHOT, {})\n",
" MODEL_TO_FEWSHOT_SCORES_ACC[MODEL].setdefault(SHOT, {})\n",
" MODEL_TO_FEWSHOT_SCORES_GEN[MODEL].setdefault(SHOT, {})\n",
"\n",
" rankeval_files = glob.glob(f\"{MODEL}/evaluation/rankeval/*_{SHOT}.csv\")\n",
" assert len(rankeval_files) == 1, f\"{rankeval_files}\"\n",
" rankeval = pd.read_csv(rankeval_files[0])\n",
"\n",
" # Rescale to 0 - 1, where 0 is random performance\n",
" rankeval[\"normalized\"] = rankeval.apply(lambda x: (x[\"value\"] - TASK_TO_BASELINE[x[\"task\"]]) / (1 - TASK_TO_BASELINE[x[\"task\"]]), axis=1)\n",
" rankeval = rankeval[rankeval[\"metric\"] == \"acc\"]\n",
" rankeval_scores = rankeval.normalized.values.tolist()\n",
"\n",
" gen_sub = generation[generation[\"fewshots\"] == SHOT]\n",
" gen_sub = gen_sub[gen_sub[\"prompt\"] != \"median\"]\n",
" gen_sub = gen_sub[gen_sub[\"prompt\"] != \"average\"]\n",
"\n",
" # 0 is already random performance, i.e. no rescaling necessary\n",
" generation_scores = gen_sub.value.values.tolist()\n",
" scores = rankeval_scores + generation_scores\n",
" average_score = sum(scores) / len(scores)\n",
"\n",
" for i, row in rankeval.iterrows():\n",
" MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][row[\"task\"]] = row[\"normalized\"]\n",
" for i, row in gen_sub.iterrows():\n",
" MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][row[\"dataset\"]] = row[\"value\"]\n",
"\n",
" babi_files = glob.glob(f\"{MODEL}/evaluation/*{SHOT}_babi.json\")\n",
" if len(babi_files) == 1:\n",
" import json\n",
" with open(babi_files[0], \"r\") as f:\n",
" MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][\"babi\"] = json.load(f)[\"results\"][\"babi\"][\"em\"]\n",
" else:\n",
" print(f\"Missing bAbI: {MODEL}; Setting to 0 (which is OK for C4)\")\n",
" MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][\"babi\"] = 0\n",
"\n",
" #MODEL_TO_FEWSHOT_SCORES[BASE_MODEL][MODEL][SHOT]\n",
" #MODEL_TO_FEWSHOT_SCORES_ACC[BASE_MODEL][MODEL].append(sum(rankeval_scores) / len(rankeval_scores))\n",
" #MODEL_TO_FEWSHOT_SCORES_GEN[BASE_MODEL][MODEL].append(sum(generation_scores) / len(generation_scores))\n"
]
},
{
"cell_type": "code",
"source": [
"### Latex Table ###\n",
"\n",
"import numpy as np\n",
"\n",
"TASK_TO_NAME = {\n",
" \"anli_r1\": \"ANLI R1\",\n",
" \"anli_r2\": \"ANLI R2\",\n",
" \"anli_r3\": \"ANLI R3\",\n",
" \"arc_challenge\": \"ARC-Challenge\",\n",
" \"arc_easy\": \"ARC-Easy\",\n",
" \"boolq\": \"BoolQ\",\n",
" \"cb\": \"CB\",\n",
" \"copa\": \"COPA\",\n",
" \"hellaswag\": \"HellaSwag\",\n",
" \"piqa\": \"PiQA\",\n",
" \"rte\": \"RTE\",\n",
" \"sciq\": \"SciQ\",\n",
" \"storycloze_2016\": \"StoryCloze 2016\",\n",
" \"winogrande\": \"WinoGrande XL\",\n",
"\n",
" \"e2e_nlg_cleaned\": \"E2E NLG\",\n",
" \"gem_xsum\": \"XSUM\",\n",
" \"web_nlg_en\": \"WebNLG EN\",\n",
" \"wiki_lingua_en\": \"WikiLingua EN\",\n",
"\n",
" \"babi\": \"bAbI\", \n",
"}\n",
"\n",
"### Table 1: OSCAR + C4 2b8\n",
"average = True\n",
"SHOT = 5\n",
"\n",
"TABLE = \"\"\"\n",
"\\\\begin{table}\n",
" \\\\centering\n",
" \\\\caption{Perplexity}\n",
" \\\\resizebox{\\\\textwidth}{!}{\n",
" \\\\begin{tabular}{l|lllllll|llll}\n",
"\"\"\"\n",
"\n",
"HEADER_A = \"Training Data & \\multicolumn{7}{c|}{C4} & \\multicolumn{4}{c}{OSCAR}\" + \" \\\\\\\\\"\n",
"HEADER_B = \"Parameters & \\multicolumn{3}{c|}{2.8B parameters} & \\multicolumn{4}{c|}{4.2B parameters} & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters}\" + \" \\\\\\\\\"\n",
"#HEADER_C = \"Dataset & No filtering & Top 25\\\\% perplexity & Top 50\\\\% Perplexity & Top 25\\\\% perplexity & Top 50\\\\% Perplexity & 25\\\\% - 75\\\\% perplexity & No filtering & Top 25\\\\% perplexity & No filtering & Top 25\\\\% perplexity\" + \" \\\\\\\\\"\n",
"HEADER_C = \"Dataset & All & Top 25\\\\% & Top 50\\\\% & All & 25\\\\% & 50\\\\% & 25\\\\% - 75\\\\% & All & Top 25\\\\% & All & Top 25\\\\% perplexity\" + \" \\\\\\\\\"\n",
"\n",
"TABLE += \"\\n\\\\midrule\\n\".join([HEADER_A, HEADER_B, HEADER_C])\n",
"\n",
"for task in TASK_TO_NAME:\n",
" scores_str = []\n",
" for model in MODELS:\n",
" if \"seed\" in model: continue\n",
" if \"perplexity\" not in model:\n",
" model_scores = []\n",
" modelseeds = model.replace(\"-repetitions\", \"seeds\")\n",
" for model_name in [model+\"\", modelseeds+\"seed1\", modelseeds+\"seed2\", modelseeds+\"seed3\", modelseeds+\"seed4\"]:\n",
" model_scores.append(sum([MODEL_TO_FEWSHOT_SCORES[model_name][SHOT][task] for SHOT in [0, 1, 2, 3, 4, 5]]) / 6)\n",
" avg = np.mean(model_scores)\n",
" std = np.std(model_scores)\n",
" scores_str.append(f\"{round(avg * 100, 1)} ± {str(round(std * 100, 1))}\")\n",
" \n",
" else:\n",
" avg = sum([MODEL_TO_FEWSHOT_SCORES[model][SHOT][task] for SHOT in [0, 1, 2, 3, 4, 5]]) / 6\n",
" scores_str.append(str(round(avg * 100, 1)))\n",
" \n",
" TABLE += \"\\n\" + f\"{TASK_TO_NAME[task]} & \" + \" & \".join(scores_str) + \" \\\\\\\\\"\n",
"\n",
" if task in (\"winogrande\", \"wiki_lingua_en\", \"babi\"):\n",
" TABLE += \"\\n\" + \"\\\\midrule\"\n",
"\n",
"\n",
"# Add average\n",
"scores_avg_str = []\n",
"for model in MODELS:\n",
" if \"seed\" in model: continue\n",
" if \"perplexity\" not in model:\n",
" model_scores = []\n",
" modelseeds = model.replace(\"-repetitions\", \"seeds\")\n",
" for model_name in [model+\"\", modelseeds+\"seed1\", modelseeds+\"seed2\", modelseeds+\"seed3\", modelseeds+\"seed4\"]:\n",
" scores = [list(MODEL_TO_FEWSHOT_SCORES[model_name][SHOT].values()) for SHOT in [0, 1, 2, 3, 4, 5]]\n",
" for task_scores in scores:\n",
" assert len(task_scores) == len(TASK_TO_NAME) \n",
" model_scores.append(\n",
" np.mean([sub_score for shot_scores in scores for sub_score in shot_scores])\n",
" )\n",
" avg = np.mean(model_scores)\n",
" std = np.std(model_scores)\n",
" scores_avg_str.append(f\"{round(avg * 100, 1)} ± {str(round(std * 100, 1))}\")\n",
"\n",
" else:\n",
" scores = [list(MODEL_TO_FEWSHOT_SCORES[model][SHOT].values()) for SHOT in [0, 1, 2, 3, 4, 5]]\n",
" for task_scores in scores:\n",
" assert len(task_scores) == len(TASK_TO_NAME)\n",
" score_avg = [sub_score for shot_scores in scores for sub_score in shot_scores]\n",
" avg = sum(score_avg) / len(score_avg)\n",
" scores_avg_str.append(f\"{round(avg * 100, 1)}\")\n",
"\n",
"#std = np.std(scores_avg[:5])\n",
"#scores_avg_std = str(round(((sum(scores_avg[:5]) / 5) * 100), 1)) + f\" ± {str(round(std * 100, 1))}\"\n",
"\n",
"#std_oscar = np.std(scores_avg[-6:-1])\n",
"#scores_avg_std_oscar = str(round(((sum(scores_avg[-6:-1]) / 5) * 100), 1)) + f\" ± {str(round(std_oscar * 100, 1))}\"\n",
"\n",
"#scores_avg_str_c4 = scores_avg_std + \" & \" + \" & \".join([str(round(score_avg * 100, 1)) for score_avg in scores_avg[5:-6]])\n",
"#scores_avg_str_oscar = scores_avg_std_oscar + \" & \" + \" & \".join([str(round(score_avg * 100, 1)) for score_avg in scores_avg[-1:]])\n",
"\n",
"\n",
"#TABLE += \"\\nAverage & \" + scores_avg_str_c4 + \" & \" + scores_avg_str_oscar + \" \\\\\\\\\"\n",
"\n",
"TABLE += \"\\nAverage & \" + \" & \".join(scores_avg_str) + \" \\\\\\\\\"\n",
"TABLE += \"\\n\\\\bottomrule\"\n",
"\n",
"TABLE += \"\"\"\n",
"\\bottomrule\n",
" \\end{tabular}\n",
" }\n",
" \\label{tab:pplx}\n",
"\\end{table}\n",
"\"\"\"\n",
"\n",
"print(TABLE)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Rp7ILYgsXZyH",
"outputId": "e8f72ce8-389c-4226-9c44-55de50c852b7"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"\\begin{table}\n",
" \\centering\n",
" \\caption{Perplexity}\n",
" \\resizebox{\\textwidth}{!}{\n",
" \\begin{tabular}{l|lllllll|llll}\n",
"Training Data & \\multicolumn{7}{c|}{C4} & \\multicolumn{4}{c}{OSCAR} \\\\\n",
"\\midrule\n",
"Parameters & \\multicolumn{3}{c|}{2.8B parameters} & \\multicolumn{4}{c|}{4.2B parameters} & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters} \\\\\n",
"\\midrule\n",
"Dataset & All & Top 25\\% & Top 50\\% & All & 25\\% & 50\\% & 25\\% - 75\\% & All & Top 25\\% & All & Top 25\\% perplexity \\\\\n",
"ANLI R1 & 0.4 ± 1.6 & -0.1 & 0.9 & -0.5 ± 1.4 & -0.0 & -0.7 & -0.8 & -0.3 ± 0.5 & -0.4 & -0.4 ± 1.2 & -2.2 \\\\\n",
"ANLI R2 & 0.9 ± 0.4 & -0.2 & -0.7 & 0.0 ± 1.3 & -0.4 & -0.0 & 1.1 & 1.0 ± 1.0 & 1.7 & 1.0 ± 0.9 & 0.7 \\\\\n",
"ANLI R3 & 1.7 ± 0.5 & 0.5 & 1.4 & 0.7 ± 0.5 & 0.7 & 2.9 & 0.4 & 0.4 ± 0.8 & 1.7 & 1.2 ± 0.5 & 2.1 \\\\\n",
"ARC-Challenge & 1.6 ± 1.0 & 3.3 & 2.9 & 4.2 ± 1.6 & 10.2 & 9.3 & 7.9 & -1.4 ± 0.8 & 3.3 & 1.8 ± 0.8 & 6.3 \\\\\n",
"ARC-Easy & 44.5 ± 0.5 & 47.3 & 47.7 & 48.1 ± 4.8 & 55.8 & 53.7 & 51.0 & 39.7 ± 0.3 & 46.8 & 45.7 ± 0.6 & 51.8 \\\\\n",
"BoolQ & 18.8 ± 3.4 & 17.1 & 17.7 & 22.4 ± 3.3 & 27.7 & 23.5 & 24.5 & 12.8 ± 4.4 & 11.8 & 12.4 ± 5.9 & 22.2 \\\\\n",
"CB & 20.0 ± 4.7 & 16.1 & 13.8 & 9.3 ± 16.6 & 24.6 & 22.3 & 12.5 & 19.7 ± 5.1 & 17.0 & 23.9 ± 3.8 & 20.1 \\\\\n",
"COPA & 49.7 ± 3.5 & 55.7 & 56.0 & 55.3 ± 3.8 & 60.7 & 66.0 & 61.0 & 42.7 ± 2.2 & 44.0 & 41.1 ± 3.0 & 49.3 \\\\\n",
"HellaSwag & 24.7 ± 0.3 & 24.7 & 26.0 & 29.4 ± 1.3 & 30.7 & 32.7 & 33.1 & 16.3 ± 0.1 & 19.0 & 21.0 ± 0.2 & 23.3 \\\\\n",
"PiQA & 47.9 ± 0.6 & 43.4 & 45.8 & 48.8 ± 3.8 & 47.9 & 52.2 & 52.1 & 41.2 ± 0.7 & 38.3 & 45.0 ± 0.6 & 44.4 \\\\\n",
"RTE & 5.1 ± 4.0 & 5.7 & 7.3 & 6.9 ± 3.1 & 11.9 & 2.2 & 10.3 & 3.9 ± 1.1 & -1.2 & 2.2 ± 4.3 & 7.0 \\\\\n",
"SciQ & 83.2 ± 0.6 & 82.4 & 82.8 & 86.3 ± 1.1 & 88.6 & 87.4 & 88.4 & 83.2 ± 0.6 & 84.0 & 86.3 ± 0.6 & 86.5 \\\\\n",
"StoryCloze 2016 & 58.7 ± 0.2 & 61.1 & 61.2 & 62.8 ± 0.5 & 65.5 & 65.6 & 65.1 & 52.8 ± 0.3 & 57.9 & 57.2 ± 0.6 & 60.2 \\\\\n",
"WinoGrande XL & 11.6 ± 0.8 & 15.3 & 14.3 & 18.7 ± 1.0 & 24.9 & 22.3 & 18.7 & 5.8 ± 0.9 & 9.7 & 10.1 ± 1.0 & 14.8 \\\\\n",
"\\midrule\n",
"E2E NLG & 17.0 ± 1.4 & 16.1 & 16.8 & 17.9 ± 0.7 & 18.8 & 17.8 & 19.2 & 20.3 ± 0.3 & 19.5 & 21.6 ± 0.7 & 22.6 \\\\\n",
"XSUM & 2.4 ± 0.1 & 2.6 & 3.0 & 3.0 ± 0.3 & 3.9 & 3.2 & 3.0 & 3.0 ± 0.1 & 3.2 & 3.7 ± 0.2 & 2.7 \\\\\n",
"WebNLG EN & 5.3 ± 0.1 & 4.8 & 5.1 & 5.6 ± 0.3 & 5.4 & 5.7 & 5.2 & 8.8 ± 0.4 & 6.9 & 9.3 ± 0.5 & 10.6 \\\\\n",
"WikiLingua EN & 3.0 ± 0.1 & 3.2 & 3.3 & 3.6 ± 0.2 & 3.4 & 3.5 & 3.4 & 2.9 ± 0.1 & 3.4 & 4.0 ± 0.1 & 3.8 \\\\\n",
"\\midrule\n",
"bAbI & 0.0 ± 0.0 & 0.0 & 0.0 & 0.0 ± 0.0 & 0.0 & 0.0 & 0.0 & 15.5 ± 1.0 & 14.5 & 19.3 ± 1.0 & 17.2 \\\\\n",
"\\midrule\n",
"Average & 20.9 ± 0.4 & 21.0 & 21.3 & 22.2 ± 1.4 & 25.3 & 24.7 & 24.0 & 19.4 ± 0.5 & 20.1 & 21.4 ± 0.5 & 23.3 \\\\\n",
"\\bottomrule\n",
"\bottomrule\n",
" \\end{tabular}\n",
" }\n",
" \\label{tab:pplx}\n",
"\\end{table}\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import glob\n",
"import os\n",
"import pandas as pd\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-dedup\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4-repetitions\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-c4seeds\n",
"\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4-dedup\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4-repetitions\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-c4seeds\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscardedup25\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscar-repetitions\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscarseeds\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscar-dedup\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscar-repetitions\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscarseeds\n",
"\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-2b8-55b-oscarroots\n",
"!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datablations/lm1-4b2-84b-oscarroots\n",
"\n",
"\n",
"TASK_TO_BASELINE = {\n",
" \"anli_r1\": 1/3,\n",
" \"anli_r2\": 1/3,\n",
" \"anli_r3\": 1/3,\n",
" \"arc_challenge\": 1/4,\n",
" \"arc_easy\": 1/4,\n",
" \"boolq\": 1/2,\n",
" \"cb\": 1/3,\n",
" \"copa\": 1/2,\n",
" \"hellaswag\": 1/4,\n",
" \"piqa\": 1/2,\n",
" \"rte\": 1/2,\n",
" \"sciq\": 1/4,\n",
" \"storycloze_2016\": 1/4,\n",
" \"winogrande\": 1/2,\n",
" \"babi\": 0,\n",
"}\n",
"\n",
"MODELS = [\n",
" \"lm1-2b8-55b-c4-repetitions/2b855b55bc4\",\n",
" \"lm1-2b8-55b-c4seeds/2b855b55bc4seed1\",\n",
" \"lm1-2b8-55b-c4seeds/2b855b55bc4seed2\",\n",
" \"lm1-2b8-55b-c4seeds/2b855b55bc4seed3\",\n",
" \"lm1-2b8-55b-c4seeds/2b855b55bc4seed4\",\n",
" \"lm1-2b8-55b-dedup\",\n",
"\n",
" \"lm1-4b2-84b-c4-repetitions/4b284b84bc4\",\n",
" \"lm1-4b2-84b-c4seeds/4b284b84bc4seed1\",\n",
" \"lm1-4b2-84b-c4seeds/4b284b84bc4seed2\",\n",
" \"lm1-4b2-84b-c4seeds/4b284b84bc4seed3\",\n",
" \"lm1-4b2-84b-c4seeds/4b284b84bc4seed4\",\n",
" \"lm1-4b2-84b-c4-dedup/4b284bc4dedup\",\n",
"\n",
" \"lm1-2b8-55b-oscar-repetitions/2b855b55boscar\",\n",
" \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed1\",\n",
" \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed2\",\n",
" \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed3\",\n",
" \"lm1-2b8-55b-oscarseeds/2b855b55boscarseed4\",\n",
" \"lm1-2b8-55b-oscardedup25/\",\n",
" \"lm1-2b8-55b-oscarroots\",\n",
"\n",
" \"lm1-4b2-84b-oscar-repetitions/4b284b84boscar\",\n",
" \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed1\",\n",
" \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed2\",\n",
" \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed3\",\n",
" \"lm1-4b2-84b-oscarseeds/4b284b84boscarseed4\",\n",
" \"lm1-4b2-84b-oscar-dedup/4b284b84boscardedup25expanded\",\n",
" \"lm1-4b2-84b-oscarroots\",\n",
"]\n",
"\n",
"\n",
"MODEL_TO_FEWSHOT_SCORES = {}\n",
"MODEL_TO_FEWSHOT_SCORES_ACC = {}\n",
"MODEL_TO_FEWSHOT_SCORES_GEN = {}\n",
"\n",
"SHOTS = list(range(6))\n",
"\n",
"for MODEL in MODELS:\n",
" MODEL_TO_FEWSHOT_SCORES.setdefault(MODEL, {})\n",
" MODEL_TO_FEWSHOT_SCORES_ACC.setdefault(MODEL, {})\n",
" MODEL_TO_FEWSHOT_SCORES_GEN.setdefault(MODEL, {})\n",
"\n",
" path = f\"{MODEL}/evaluation/generation/merged.csv\"\n",
" if not os.path.exists(path):\n",
" print(\"Skipping: \", {path})\n",
" continue\n",
" generation = pd.read_csv(path)\n",
" for SHOT in SHOTS: \n",
" \n",
" MODEL_TO_FEWSHOT_SCORES[MODEL].setdefault(SHOT, {})\n",
" MODEL_TO_FEWSHOT_SCORES_ACC[MODEL].setdefault(SHOT, {})\n",
" MODEL_TO_FEWSHOT_SCORES_GEN[MODEL].setdefault(SHOT, {})\n",
"\n",
" rankeval_files = glob.glob(f\"{MODEL}/evaluation/rankeval/*_{SHOT}.csv\")\n",
" assert len(rankeval_files) == 1, f\"{rankeval_files}\"\n",
" rankeval = pd.read_csv(rankeval_files[0])\n",
"\n",
" # Rescale to 0 - 1, where 0 is random performance\n",
" rankeval[\"normalized\"] = rankeval.apply(lambda x: (x[\"value\"] - TASK_TO_BASELINE[x[\"task\"]]) / (1 - TASK_TO_BASELINE[x[\"task\"]]), axis=1)\n",
" rankeval = rankeval[rankeval[\"metric\"] == \"acc\"]\n",
" rankeval_scores = rankeval.normalized.values.tolist()\n",
"\n",
" gen_sub = generation[generation[\"fewshots\"] == SHOT]\n",
" gen_sub = gen_sub[gen_sub[\"prompt\"] != \"median\"]\n",
" gen_sub = gen_sub[gen_sub[\"prompt\"] != \"average\"]\n",
"\n",
" # 0 is already random performance, i.e. no rescaling necessary\n",
" generation_scores = gen_sub.value.values.tolist()\n",
" scores = rankeval_scores + generation_scores\n",
" average_score = sum(scores) / len(scores)\n",
"\n",
" for i, row in rankeval.iterrows():\n",
" MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][row[\"task\"]] = row[\"normalized\"]\n",
" for i, row in gen_sub.iterrows():\n",
" MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][row[\"dataset\"]] = row[\"value\"]\n",
"\n",
" babi_files = glob.glob(f\"{MODEL}/evaluation/*{SHOT}_babi.json\")\n",
" if len(babi_files) == 1:\n",
" import json\n",
" with open(babi_files[0], \"r\") as f:\n",
" MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][\"babi\"] = json.load(f)[\"results\"][\"babi\"][\"em\"]\n",
" else:\n",
" print(f\"Missing bAbI: {MODEL}; Setting to 0 (which is OK for C4)\")\n",
" MODEL_TO_FEWSHOT_SCORES[MODEL][SHOT][\"babi\"] = 0\n",
"\n",
" #MODEL_TO_FEWSHOT_SCORES[BASE_MODEL][MODEL][SHOT]\n",
" #MODEL_TO_FEWSHOT_SCORES_ACC[BASE_MODEL][MODEL].append(sum(rankeval_scores) / len(rankeval_scores))\n",
" #MODEL_TO_FEWSHOT_SCORES_GEN[BASE_MODEL][MODEL].append(sum(generation_scores) / len(generation_scores))\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nKLZFSCWXnHN",
"outputId": "4f6f4dbc-db43-4304-a6aa-69887bd01bdc"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"fatal: destination path 'lm1-2b8-55b-dedup' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-c4-repetitions' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-c4seeds' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-c4-dedup' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-c4-repetitions' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-c4seeds' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-oscardedup25' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-oscar-repetitions' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-oscarseeds' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-oscar-dedup' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-oscar-repetitions' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-oscarseeds' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-2b8-55b-oscarroots' already exists and is not an empty directory.\n",
"fatal: destination path 'lm1-4b2-84b-oscarroots' already exists and is not an empty directory.\n",
"Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-2b8-55b-dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-repetitions/4b284b84bc4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed1; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed2; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed3; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4seeds/4b284b84bc4seed4; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n",
"Missing bAbI: lm1-4b2-84b-c4-dedup/4b284bc4dedup; Setting to 0 (which is OK for C4)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"### Latex Table ###\n",
"\n",
"import numpy as np\n",
"\n",
"TASK_TO_NAME = {\n",
" \"anli_r1\": \"ANLI R1\",\n",
" \"anli_r2\": \"ANLI R2\",\n",
" \"anli_r3\": \"ANLI R3\",\n",
" \"arc_challenge\": \"ARC-Challenge\",\n",
" \"arc_easy\": \"ARC-Easy\",\n",
" \"boolq\": \"BoolQ\",\n",
" \"cb\": \"CB\",\n",
" \"copa\": \"COPA\",\n",
" \"hellaswag\": \"HellaSwag\",\n",
" \"piqa\": \"PiQA\",\n",
" \"rte\": \"RTE\",\n",
" \"sciq\": \"SciQ\",\n",
" \"storycloze_2016\": \"StoryCloze 2016\",\n",
" \"winogrande\": \"WinoGrande XL\",\n",
"\n",
" \"e2e_nlg_cleaned\": \"E2E NLG\",\n",
" \"gem_xsum\": \"XSUM\",\n",
" \"web_nlg_en\": \"WebNLG EN\",\n",
" \"wiki_lingua_en\": \"WikiLingua EN\",\n",
"\n",
" \"babi\": \"bAbI\", \n",
"}\n",
"\n",
"### Table 1: OSCAR + C4 2b8\n",
"average = True\n",
"SHOT = 5\n",
"\n",
"TABLE = \"\"\"\n",
"\\\\begin{table}\n",
" \\\\centering\n",
" \\\\caption{\\textbf{Results after filtering with deduplication.}}\n",
" \\\\resizebox{\\\\textwidth}{!}{\n",
" \\\\begin{tabular}{l|llll|llll}\n",
" \\toprule \n",
"\"\"\"\n",
"\n",
"HEADER_A = \"Training Data & \\multicolumn{4}{c|}{C4} & \\multicolumn{4}{c}{OSCAR}\" + \" \\\\\\\\\"\n",
"HEADER_B = \"Parameters & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters} & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters}\" + \" \\\\\\\\\"\n",
"HEADER_C = \"Dataset & All & Dedup & All & Dedup & All & Dedup & All & Dedup\" + \" \\\\\\\\\"\n",
"\n",
"TABLE += \"\\n\\\\midrule\\n\".join([HEADER_A, HEADER_B, HEADER_C])\n",
"\n",
"for task in TASK_TO_NAME:\n",
" scores_str = []\n",
" for model in MODELS:\n",
" if \"seed\" in model: continue\n",
" if (\"dedup\" not in model) and (\"roots\" not in model):\n",
" model_scores = []\n",
" modelseeds = model.replace(\"-repetitions\", \"seeds\")\n",
" for model_name in [model+\"\", modelseeds+\"seed1\", modelseeds+\"seed2\", modelseeds+\"seed3\", modelseeds+\"seed4\"]:\n",
" model_scores.append(sum([MODEL_TO_FEWSHOT_SCORES[model_name][SHOT][task] for SHOT in [0, 1, 2, 3, 4, 5]]) / 6)\n",
" avg = np.mean(model_scores)\n",
" std = np.std(model_scores)\n",
" scores_str.append(f\"{round(avg * 100, 1)} ± {str(round(std * 100, 1))}\")\n",
" \n",
" else:\n",
" avg = sum([MODEL_TO_FEWSHOT_SCORES[model][SHOT][task] for SHOT in [0, 1, 2, 3, 4, 5]]) / 6\n",
" scores_str.append(str(round(avg * 100, 1)))\n",
" \n",
" TABLE += \"\\n\" + f\"{TASK_TO_NAME[task]} & \" + \" & \".join(scores_str) + \" \\\\\\\\\"\n",
"\n",
" if task in (\"winogrande\", \"wiki_lingua_en\", \"babi\"):\n",
" TABLE += \"\\n\" + \"\\\\midrule\"\n",
"\n",
"\n",
"# Add average\n",
"scores_avg_str = []\n",
"for model in MODELS:\n",
" if \"seed\" in model: continue\n",
" if (\"dedup\" not in model) and (\"roots\" not in model):\n",
" model_scores = []\n",
" modelseeds = model.replace(\"-repetitions\", \"seeds\")\n",
" for model_name in [model+\"\", modelseeds+\"seed1\", modelseeds+\"seed2\", modelseeds+\"seed3\", modelseeds+\"seed4\"]:\n",
" scores = [list(MODEL_TO_FEWSHOT_SCORES[model_name][SHOT].values()) for SHOT in [0, 1, 2, 3, 4, 5]]\n",
" for task_scores in scores:\n",
" assert len(task_scores) == len(TASK_TO_NAME) \n",
" model_scores.append(\n",
" np.mean([sub_score for shot_scores in scores for sub_score in shot_scores])\n",
" )\n",
" avg = np.mean(model_scores)\n",
" std = np.std(model_scores)\n",
" scores_avg_str.append(f\"{round(avg * 100, 1)} ± {str(round(std * 100, 1))}\")\n",
"\n",
" else:\n",
" scores = [list(MODEL_TO_FEWSHOT_SCORES[model][SHOT].values()) for SHOT in [0, 1, 2, 3, 4, 5]]\n",
" for task_scores in scores:\n",
" assert len(task_scores) == len(TASK_TO_NAME)\n",
" score_avg = [sub_score for shot_scores in scores for sub_score in shot_scores]\n",
" avg = sum(score_avg) / len(score_avg)\n",
" scores_avg_str.append(f\"{round(avg * 100, 1)}\")\n",
"\n",
"TABLE += \"\\nAverage & \" + \" & \".join(scores_avg_str) + \" \\\\\\\\\"\n",
"TABLE += \"\\n\\\\bottomrule\"\n",
"\n",
"TABLE += \"\"\"\n",
"\\bottomrule\n",
" \\end{tabular}\n",
" }\n",
" \\label{tab:dedup}\n",
"\\end{table}\n",
"\"\"\"\n",
"\n",
"print(TABLE)"
],
"metadata": {
"id": "gy-vQlPyZG7c",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "85464b57-c629-495b-c8b2-8a38de221c65"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"\\begin{table}\n",
" \\centering\n",
" \\caption{\textbf{Results after filtering with deduplication.}}\n",
" \\resizebox{\\textwidth}{!}{\n",
" \\begin{tabular}{l|llll|llll}\n",
" \toprule \n",
"Training Data & \\multicolumn{4}{c|}{C4} & \\multicolumn{4}{c}{OSCAR} \\\\\n",
"\\midrule\n",
"Parameters & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters} & \\multicolumn{2}{c|}{2.8B parameters} & \\multicolumn{2}{c|}{4.2B parameters} \\\\\n",
"\\midrule\n",
"Dataset & All & Dedup & All & Dedup & All & Dedup & All & Dedup \\\\\n",
"ANLI R1 & 0.4 ± 1.6 & -0.2 & -0.5 ± 1.4 & -0.8 & -0.3 ± 0.5 & -2.1 & -1.7 & -0.4 ± 1.2 & -1.8 & 1.2 \\\\\n",
"ANLI R2 & 0.9 ± 0.4 & 1.1 & 0.0 ± 1.3 & -0.1 & 1.0 ± 1.0 & 2.0 & 0.7 & 1.0 ± 0.9 & -0.5 & -0.3 \\\\\n",
"ANLI R3 & 1.7 ± 0.5 & 1.8 & 0.7 ± 0.5 & 0.4 & 0.4 ± 0.8 & 0.4 & 0.2 & 1.2 ± 0.5 & 0.8 & -0.3 \\\\\n",
"ARC-Challenge & 1.6 ± 1.0 & 0.6 & 4.2 ± 1.6 & 3.9 & -1.4 ± 0.8 & 2.6 & -0.9 & 1.8 ± 0.8 & 6.8 & 0.6 \\\\\n",
"ARC-Easy & 44.5 ± 0.5 & 43.0 & 48.1 ± 4.8 & 46.8 & 39.7 ± 0.3 & 44.6 & 42.3 & 45.7 ± 0.6 & 51.0 & 47.1 \\\\\n",
"BoolQ & 18.8 ± 3.4 & 1.5 & 22.4 ± 3.3 & 2.2 & 12.8 ± 4.4 & 3.4 & 13.4 & 12.4 ± 5.9 & 13.0 & 7.0 \\\\\n",
"CB & 20.0 ± 4.7 & 0.4 & 9.3 ± 16.6 & 0.9 & 19.7 ± 5.1 & 25.4 & 14.3 & 23.9 ± 3.8 & 25.0 & 28.1 \\\\\n",
"COPA & 49.7 ± 3.5 & 57.0 & 55.3 ± 3.8 & 60.0 & 42.7 ± 2.2 & 47.3 & 37.7 & 41.1 ± 3.0 & 55.3 & 43.0 \\\\\n",
"HellaSwag & 24.7 ± 0.3 & 25.1 & 29.4 ± 1.3 & 30.7 & 16.3 ± 0.1 & 22.8 & 17.6 & 21.0 ± 0.2 & 26.3 & 22.4 \\\\\n",
"PiQA & 47.9 ± 0.6 & 49.1 & 48.8 ± 3.8 & 53.4 & 41.2 ± 0.7 & 45.1 & 41.9 & 45.0 ± 0.6 & 48.5 & 46.3 \\\\\n",
"RTE & 5.1 ± 4.0 & 3.2 & 6.9 ± 3.1 & 0.1 & 3.9 ± 1.1 & 6.1 & 5.8 & 2.2 ± 4.3 & 1.1 & 8.9 \\\\\n",
"SciQ & 83.2 ± 0.6 & 80.4 & 86.3 ± 1.1 & 82.2 & 83.2 ± 0.6 & 82.6 & 83.1 & 86.3 ± 0.6 & 88.5 & 86.4 \\\\\n",
"StoryCloze 2016 & 58.7 ± 0.2 & 61.8 & 62.8 ± 0.5 & 65.2 & 52.8 ± 0.3 & 58.1 & 54.3 & 57.2 ± 0.6 & 61.6 & 58.6 \\\\\n",
"WinoGrande XL & 11.6 ± 0.8 & 13.3 & 18.7 ± 1.0 & 19.7 & 5.8 ± 0.9 & 12.7 & 5.6 & 10.1 ± 1.0 & 16.2 & 11.0 \\\\\n",
"\\midrule\n",
"E2E NLG & 17.0 ± 1.4 & 15.6 & 17.9 ± 0.7 & 14.2 & 20.3 ± 0.3 & 20.5 & 20.5 & 21.6 ± 0.7 & 2.4 & 22.6 \\\\\n",
"XSUM & 2.4 ± 0.1 & 2.1 & 3.0 ± 0.3 & 2.5 & 3.0 ± 0.1 & 3.2 & 3.1 & 3.7 ± 0.2 & 4.6 & 3.8 \\\\\n",
"WebNLG EN & 5.3 ± 0.1 & 4.3 & 5.6 ± 0.3 & 4.4 & 8.8 ± 0.4 & 7.4 & 7.4 & 9.3 ± 0.5 & 9.7 & 9.4 \\\\\n",
"WikiLingua EN & 3.0 ± 0.1 & 3.2 & 3.6 ± 0.2 & 3.2 & 2.9 ± 0.1 & 3.0 & 3.1 & 4.0 ± 0.1 & 4.3 & 4.0 \\\\\n",
"\\midrule\n",
"bAbI & 0.0 ± 0.0 & 0.0 & 0.0 ± 0.0 & 0.0 & 15.5 ± 1.0 & 17.2 & 14.3 & 19.3 ± 1.0 & 21.1 & 18.0 \\\\\n",
"\\midrule\n",
"Average & 20.9 ± 0.4 & 19.1 & 22.2 ± 1.4 & 20.5 & 19.4 ± 0.5 & 21.2 & 19.1 & 21.4 ± 0.5 & 22.8 & 22.0 \\\\\n",
"\\bottomrule\n",
"\bottomrule\n",
" \\end{tabular}\n",
" }\n",
" \\label{tab:dedup}\n",
"\\end{table}\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "9PizHEDmZBpO"
},
"execution_count": null,
"outputs": []
}
]
}