course/videos/qa_processing.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form"
},
"outputs": [
{
"data": {
"text/html": [
"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/qgaM0weJHpA?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#@title\n",
"from IPython.display import HTML\n",
"\n",
"HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/qgaM0weJHpA?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install datasets transformers[sentencepiece]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"raw_datasets = load_dataset(\"squad\")\n",
"raw_datasets = raw_datasets.remove_columns([\"id\", \"title\"])\n",
"\n",
"def prepare_data(example):\n",
" answer = example[\"answers\"][\"text\"][0]\n",
" example[\"answer_start\"] = example[\"answers\"][\"answer_start\"][0]\n",
" example[\"answer_end\"] = example[\"answer_start\"] + len(answer)\n",
" return example\n",
"\n",
"raw_datasets = raw_datasets.map(prepare_data, remove_columns=[\"answers\"])\n",
"raw_datasets[\"train\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Context: {raw_datasets['train'][0]['context']\")\n",
"print(f\"Question: {raw_datasets['train'][0]['question']\")\n",
"start = raw_datasets[\"train\"][0][\"answer_start\"]\n",
"end = raw_datasets[\"train\"][0][\"answer_end\"]\n",
"print(f\"\\nAnswer: {raw_datasets['train'][0]['context'][start:end]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"model_checkpoint = \"bert-base-cased\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"\n",
"example = raw_datasets[\"train\"][0]\n",
"inputs = tokenizer(\n",
" example[\"question\"],\n",
" example[\"context\"],\n",
" truncation=\"only_second\",\n",
" padding=\"max_length\",\n",
" max_length=384,\n",
" stride=128,\n",
" return_overflowing_tokens=True,\n",
" return_offsets_mapping=True\n",
")"
]
},
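{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since `return_overflowing_tokens=True` can split one (question, context) pair into several features, a quick sanity check (not shown in the video) is to look at how many features were created and what keys the encoding contains:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A single example may yield several features once the context is chunked with a stride.\n",
"print(f\"Number of features: {len(inputs['input_ids'])}\")\n",
"print(inputs.keys())"
]
},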
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def find_labels(offsets, answer_start, answer_end, sequence_ids):\n",
" idx = 0\n",
" while sequence_ids[idx] != 1:\n",
" idx += 1\n",
" context_start = idx\n",
" while sequence_ids[idx] == 1:\n",
" idx += 1\n",
" context_end = idx - 1\n",
"\n",
" # If the answer is not fully in the context, return (0, 0)\n",
" if offsets[context_start][0] > answer_end or offsets[context_end][1] < answer_start:\n",
" return (0, 0)\n",
" else:\n",
" idx = context_start\n",
" while idx <= context_end and offsets[idx][0] <= answer_start:\n",
" idx += 1\n",
" start_position = idx - 1\n",
"\n",
" idx = context_end\n",
" while idx >= context_start and offsets[idx][1] >= answer_end:\n",
" idx -= 1\n",
" end_position = idx + 1\n",
" \n",
" return start_position, end_position"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"start, end = find_labels(\n",
" inputs[\"offset_mapping\"][0],\n",
" example[\"answer_start\"],\n",
" example[\"answer_end\"],\n",
" inputs.sequence_ids(0)\n",
")\n",
"tokenizer.decode(inputs[\"input_ids\"][0][start: end+1])"
]
},
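{
"cell_type": "markdown",
"metadata": {},
"source": [
"As another check (not in the video), we can compare the decoded span with the answer sliced directly from the context using the character indices; the two should match up to minor tokenization artifacts:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The decoded tokens above should correspond to the answer taken straight from the context.\n",
"example[\"context\"][example[\"answer_start\"]: example[\"answer_end\"]]"
]
},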
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_training_examples(examples):\n",
" questions = [q.strip() for q in examples[\"question\"]]\n",
" inputs = tokenizer(\n",
" examples[\"question\"],\n",
" examples[\"context\"],\n",
" truncation=\"only_second\",\n",
" padding=\"max_length\",\n",
" max_length=384,\n",
" stride=128,\n",
" return_overflowing_tokens=True,\n",
" return_offsets_mapping=True,\n",
" )\n",
" \n",
" offset_mapping = inputs.pop(\"offset_mapping\")\n",
" sample_map = inputs.pop(\"overflow_to_sample_mapping\")\n",
" inputs[\"start_positions\"] = []\n",
" inputs[\"end_positions\"] = []\n",
"\n",
" for i, offset in enumerate(offset_mapping):\n",
" sample_idx = sample_map[i]\n",
" start, end = find_labels(\n",
" offset, examples[\"answer_start\"][sample_idx], examples[\"answer_end\"][sample_idx], inputs.sequence_ids(i)\n",
" )\n",
" \n",
" inputs[\"start_positions\"].append(start)\n",
" inputs[\"end_positions\"].append(end)\n",
"\n",
" return inputs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenized_datasets = raw_datasets.map(\n",
" preprocess_training_examples,\n",
" batched=True,\n",
" remove_columns=raw_datasets[\"train\"].column_names,\n",
")"
]
},
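{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because long contexts are split into several features, the processed dataset can contain more rows than the raw one. A quick comparison:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Chunking with overflow usually creates more features than there were examples.\n",
"print(len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"]))"
]
},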
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"name": "Data processing for Question Answering",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}