course/videos/push_to_hub_old.ipynb (570 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form"
},
"outputs": [
{
"data": {
"text/html": [
"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/Z1-XMy-GNLQ?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#@title\n",
"from IPython.display import HTML\n",
"\n",
"HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/Z1-XMy-GNLQ?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install datasets transformers[sentencepiece]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from datasets import load_dataset, load_metric\n",
"from transformers import (\n",
" AutoModelForSequenceClassification,\n",
" AutoTokenizer,\n",
" DataCollatorWithPadding,\n",
" Trainer,\n",
" TrainingArguments,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"checkpoint = \"bert-base-cased\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Reusing dataset glue (/home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
"Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-8174fd92eed0af98.arrow\n",
"Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-8c99fb059544bc96.arrow\n",
"Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e625eb72bcf1ae1f.arrow\n",
"Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n",
"- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"\n",
"def tokenize_function(examples):\n",
" return tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True)\n",
"\n",
"tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
"\n",
"model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
"\n",
"training_args = TrainingArguments(\n",
" \"finetuned-bert\",\n",
" per_device_train_batch_size=16,\n",
" per_device_eval_batch_size=16,\n",
" learning_rate=2e-5,\n",
" weight_decay=0.01,\n",
" evaluation_strategy=\"epoch\",\n",
" logging_strategy=\"epoch\",\n",
")\n",
"\n",
"data_collator = DataCollatorWithPadding(tokenizer)\n",
"\n",
"metric = load_metric(\"glue\", \"mrpc\")\n",
"\n",
"def compute_metrics(eval_preds):\n",
" logits, labels = eval_preds\n",
" predictions = np.argmax(logits, axis=-1)\n",
" return metric.compute(predictions=predictions, references=labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"trainer = Trainer(\n",
" model,\n",
" training_args,\n",
" train_dataset=tokenized_datasets[\"train\"],\n",
" eval_dataset=tokenized_datasets[\"validation\"],\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='690' max='690' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [690/690 01:13, Epoch 3/3]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Epoch</th>\n",
" <th>Training Loss</th>\n",
" <th>Validation Loss</th>\n",
" <th>Accuracy</th>\n",
" <th>F1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.573500</td>\n",
" <td>0.475627</td>\n",
" <td>0.774510</td>\n",
" <td>0.835714</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.383800</td>\n",
" <td>0.452076</td>\n",
" <td>0.821078</td>\n",
" <td>0.878939</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.233600</td>\n",
" <td>0.485343</td>\n",
" <td>0.850490</td>\n",
" <td>0.897133</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"TrainOutput(global_step=690, training_loss=0.39697496718254643, metrics={'train_runtime': 74.2937, 'train_samples_per_second': 148.115, 'train_steps_per_second': 9.287, 'total_flos': 563360051116800.0, 'train_loss': 0.39697496718254643, 'epoch': 3.0})"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Push to hub from the Trainer directly"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will need an authentication token with your Hugging Face credentials to use the `push_to_hub` method. Execute `huggingface-cli login` in your terminal or by uncommenting the following cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !huggingface-cli login"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `Trainer` has a new method to directly upload the model, tokenizer and model configuration in a repo on the [Hub](https://huggingface.co/). It will even auto-generate a model card draft using the hyperparameters and evaluation results!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://huggingface.co/sgugger/finetuned-bert/commit/12c9aaaadf60e2c48e9419524f3170f445a120d6'"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainer.push_to_hub(repo_name=\"finetuned-bert\")\n",
"# trainer.push_to_hub(repo_name=\"finetuned-bert\", organization=\"huggingface\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you are using your own training loop, you can push the model and tokenizer separately (and you will have to write the model card yourself):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# model.push_to_hub(\"finetuned-bert\")\n",
"# tokenizer.push_to_hub(\"finetuned-bert\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## You can load your model from anywhere using from_pretrained!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "533ace86937d46e2bf6e98452fa786aa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=671.0, style=ProgressStyle(description_…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d5600c7616cc44a0a3cf767fe0612bb0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433336585.0, style=ProgressStyle(descri…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from transformers import AutoModelForSequenceClassification\n",
"\n",
"model_name = \"sgugger/finetuned-bert\"\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## You can use your model in a pipeline!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "222535481b2d4ded93f597311e7ec283",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bccaf4cf945848e3863b984ed8a18a71",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435816.0, style=ProgressStyle(descripti…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f7aa6d01720542458c454f7f95d2b881",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "00bbc6ae04a14273a93b8613e75d17e6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=284.0, style=ProgressStyle(description_…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from transformers import pipeline\n",
"\n",
"classifier = pipeline(\"text-classification\", model=model_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'label': 'LABEL_0', 'score': 0.7789641618728638}]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier(\"My name is Sylvain. [SEP] My name is Lysandre\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Updating a problematic file is super easy!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.config.label2id = {\"not equivalent\": 0, \"equivalent\": 1}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.config.id2label = {0: \"not equivalent\", 1: \"equivalent\"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.config.push_to_hub(\"finetuned-bert\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "09d7cee3537f4385b11dd3d647abca15",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=814.0, style=ProgressStyle(description_…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"text/plain": [
"[{'label': 'not equivalent', 'score': 0.7789641618728638}]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier = pipeline(\"text-classification\", model=model_name)\n",
"\n",
"classifier(\"My name is Sylvain. [SEP] My name is Lysandre\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"name": "The push to hub API",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}