course/fr/chapter7/section3_pt.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "uVoMtekMk-8i"
},
"source": [
"# Finetuner un modèle de language masqué (PyTorch)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "97UMn_IHk-8n"
},
"source": [
"Installez les bibliothèques 🤗 *Datasets*, 🤗 *Transformers* et 🤗 *Accelerate* pour exécuter ce *notebook*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JTP7moZJk-8r"
},
"outputs": [],
"source": [
"!pip install datasets transformers[sentencepiece]\n",
"!pip install accelerate\n",
"# Pour exécuter l'entraînement sur TPU, vous devez décommenter la ligne suivante :\n",
"# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl\n",
"!apt install git-lfs"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "W8b-VVo2k-8w"
},
"source": [
"Vous aurez besoin de configurer git, adaptez votre email et votre nom dans la cellule suivante."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nKcOiNkCk-8z"
},
"outputs": [],
"source": [
"!git config --global user.email \"you@example.com\"\n",
"!git config --global user.name \"Your Name\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kDYcdgKqk-81"
},
"source": [
"Vous devrez également être connecté au Hub d'Hugging Face. Exécutez ce qui suit et entrez vos informations d'identification."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FaqLiMYJk-85"
},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bnwlqB03k-87"
},
"outputs": [],
"source": [
"from transformers import AutoModelForMaskedLM\n",
"\n",
"model_checkpoint = \"camembert-base\"\n",
"model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bREfPqNMk-89"
},
"outputs": [],
"source": [
"text = \"C'est une grande <mask>.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5EjKRk9Bk-9C"
},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "15KV-A67k-9E"
},
"outputs": [],
"source": [
"import torch\n",
"\n",
"inputs = tokenizer(text, return_tensors=\"pt\")\n",
"token_logits = model(**inputs).logits\n",
"# Trouver l'emplacement du <mask> et extraire ses logits\n",
"mask_token_index = torch.where(inputs[\"input_ids\"] == tokenizer.mask_token_id)[1]\n",
"mask_token_logits = token_logits[0, mask_token_index, :]\n",
"# Choisir les <mask> candidats avec les logits les plus élevés\n",
"top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()\n",
"\n",
"for token in top_5_tokens:\n",
" print(f\"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lQuMuSuwk-9J"
},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"imdb_dataset = load_dataset(\"allocine\")\n",
"imdb_dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "svNO5I9lk-9M"
},
"outputs": [],
"source": [
"sample = imdb_dataset[\"train\"].shuffle(seed=42).select(range(3))\n",
"\n",
"for row in sample:\n",
" print(f\"\\n'>>> Review: {row['review']}'\")\n",
" print(f\"'>>> Label: {row['label']}'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "tm0x7w6Gk-9P"
},
"outputs": [],
"source": [
"def tokenize_function(examples):\n",
" result = tokenizer(examples[\"review\"])\n",
" if tokenizer.is_fast:\n",
" result[\"word_ids\"] = [result.word_ids(i) for i in range(len(result[\"input_ids\"]))]\n",
" return result\n",
"\n",
"\n",
"# Utilisez batched=True pour activer le multithreading rapide !\n",
"tokenized_datasets = imdb_dataset.map(\n",
" tokenize_function, batched=True, remove_columns=[\"review\", \"label\"]\n",
")\n",
"tokenized_datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5zJvlXd2k-9R"
},
"outputs": [],
"source": [
"tokenizer.model_max_length"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "tk3TfASAk-9T"
},
"outputs": [],
"source": [
"chunk_size = 128"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "y0ZMbjwQk-9U"
},
"outputs": [],
"source": [
"# Le découpage produit une liste de listes pour chaque caractéristique\n",
"tokenized_samples = tokenized_datasets[\"train\"][:3]\n",
"\n",
"for idx, sample in enumerate(tokenized_samples[\"input_ids\"]):\n",
" print(f\"'>>> Review {idx} length: {len(sample)}'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Mg2PlCl8k-9X"
},
"outputs": [],
"source": [
"concatenated_examples = {\n",
" k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()\n",
"}\n",
"total_length = len(concatenated_examples[\"input_ids\"])\n",
"print(f\"'>>> Concatenated reviews length: {total_length}'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BU_uR_dLk-9Y"
},
"outputs": [],
"source": [
"chunks = {\n",
" k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]\n",
" for k, t in concatenated_examples.items()\n",
"}\n",
"\n",
"for chunk in chunks[\"input_ids\"]:\n",
" print(f\"'>>> Chunk length: {len(chunk)}'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Y9Fwylomk-9b"
},
"outputs": [],
"source": [
"def group_texts(examples):\n",
" # Concaténation de tous les textes\n",
" concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
" # Calculer la longueur des textes concaténés\n",
" total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
" # Nous laissons tomber le dernier morceau s'il est plus petit que chunk_size\n",
" total_length = (total_length // chunk_size) * chunk_size\n",
" # Fractionnement par morceaux de max_len\n",
" result = {\n",
" k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]\n",
" for k, t in concatenated_examples.items()\n",
" }\n",
" # Créer une nouvelle colonne d'étiquettes\n",
" result[\"labels\"] = result[\"input_ids\"].copy()\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0Ez61wwAk-9e"
},
"outputs": [],
"source": [
"lm_datasets = tokenized_datasets.map(group_texts, batched=True)\n",
"lm_datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "TPeyRGujk-9f"
},
"outputs": [],
"source": [
"tokenizer.decode(lm_datasets[\"train\"][1][\"input_ids\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8Qz1pwNAk-9h"
},
"outputs": [],
"source": [
"from transformers import DataCollatorForLanguageModeling\n",
"\n",
"data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "sQIG43glk-9j"
},
"outputs": [],
"source": [
"samples = [lm_datasets[\"train\"][i] for i in range(2)]\n",
"for sample in samples:\n",
" _ = sample.pop(\"word_ids\")\n",
"\n",
"for chunk in data_collator(samples)[\"input_ids\"]:\n",
" print(f\"\\n'>>> {tokenizer.decode(chunk)}'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YHsPa9_Uk-9k"
},
"outputs": [],
"source": [
"import collections\n",
"import numpy as np\n",
"\n",
"from transformers import default_data_collator\n",
"\n",
"wwm_probability = 0.2\n",
"\n",
"\n",
"def whole_word_masking_data_collator(features):\n",
" for feature in features:\n",
" word_ids = feature.pop(\"word_ids\")\n",
"\n",
" # Création d'une correspondance entre les mots et les indices des tokens correspondants\n",
" mapping = collections.defaultdict(list)\n",
" current_word_index = -1\n",
" current_word = None\n",
" for idx, word_id in enumerate(word_ids):\n",
" if word_id is not None:\n",
" if word_id != current_word:\n",
" current_word = word_id\n",
" current_word_index += 1\n",
" mapping[current_word_index].append(idx)\n",
"\n",
" # Masquer des mots de façon aléatoire\n",
" mask = np.random.binomial(1, wwm_probability, (len(mapping),))\n",
" input_ids = feature[\"input_ids\"]\n",
" labels = feature[\"labels\"]\n",
" new_labels = [-100] * len(labels)\n",
" for word_id in np.where(mask)[0]:\n",
" word_id = word_id.item()\n",
" for idx in mapping[word_id]:\n",
" new_labels[idx] = labels[idx]\n",
" input_ids[idx] = tokenizer.mask_token_id\n",
" feature[\"labels\"] = new_labels\n",
"\n",
" return default_data_collator(features)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "p3CVF4egk-9n"
},
"outputs": [],
"source": [
"samples = [lm_datasets[\"train\"][i] for i in range(2)]\n",
"batch = whole_word_masking_data_collator(samples)\n",
"\n",
"for chunk in batch[\"input_ids\"]:\n",
" print(f\"\\n'>>> {tokenizer.decode(chunk)}'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Z2Ez4ONCk-9o"
},
"outputs": [],
"source": [
"train_size = 10_000\n",
"test_size = int(0.1 * train_size)\n",
"\n",
"downsampled_dataset = lm_datasets[\"train\"].train_test_split(\n",
" train_size=train_size, test_size=test_size, seed=42\n",
")\n",
"downsampled_dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "IUO-qIllk-9r"
},
"outputs": [],
"source": [
"from transformers import TrainingArguments\n",
"\n",
"batch_size = 64\n",
"# Montrer la perte d'entraînement à chaque époque\n",
"logging_steps = len(downsampled_dataset[\"train\"]) // batch_size\n",
"model_name = model_checkpoint.split(\"/\")[-1]\n",
"\n",
"training_args = TrainingArguments(\n",
" output_dir=f\"{model_name}-finetuned-allocine\",\n",
" overwrite_output_dir=True,\n",
" evaluation_strategy=\"epoch\",\n",
" learning_rate=2e-5,\n",
" weight_decay=0.01,\n",
" per_device_train_batch_size=batch_size,\n",
" per_device_eval_batch_size=batch_size,\n",
" push_to_hub=True,\n",
" fp16=True,\n",
" logging_steps=logging_steps,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ipKYzt6sk-9t"
},
"outputs": [],
"source": [
"from transformers import Trainer\n",
"\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=downsampled_dataset[\"train\"],\n",
" eval_dataset=downsampled_dataset[\"test\"],\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ruFr2J1sk-9u"
},
"outputs": [],
"source": [
"import math\n",
"\n",
"eval_results = trainer.evaluate()\n",
"print(f\">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-fuBwvqek-9w"
},
"outputs": [],
"source": [
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "22r8rDYTk-9x"
},
"outputs": [],
"source": [
"eval_results = trainer.evaluate()\n",
"print(f\">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cv0CwVamk-9z"
},
"outputs": [],
"source": [
"trainer.push_to_hub()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-GRfjc37k-91"
},
"outputs": [],
"source": [
"def insert_random_mask(batch):\n",
" features = [dict(zip(batch, t)) for t in zip(*batch.values())]\n",
" masked_inputs = data_collator(features)\n",
" # Créer une nouvelle colonne \"masquée\" pour chaque colonne du jeu de données\n",
" return {\"masked_\" + k: v.numpy() for k, v in masked_inputs.items()}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RZbvYGpBk-91"
},
"outputs": [],
"source": [
"downsampled_dataset = downsampled_dataset.remove_columns([\"word_ids\"])\n",
"eval_dataset = downsampled_dataset[\"test\"].map(\n",
" insert_random_mask,\n",
" batched=True,\n",
" remove_columns=downsampled_dataset[\"test\"].column_names,\n",
")\n",
"eval_dataset = eval_dataset.rename_columns(\n",
" {\n",
" \"masked_input_ids\": \"input_ids\",\n",
" \"masked_attention_mask\": \"attention_mask\",\n",
" \"masked_labels\": \"labels\",\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B953lLCEk-93"
},
"outputs": [],
"source": [
"from torch.utils.data import DataLoader\n",
"from transformers import default_data_collator\n",
"\n",
"batch_size = 64\n",
"train_dataloader = DataLoader(\n",
" downsampled_dataset[\"train\"],\n",
" shuffle=True,\n",
" batch_size=batch_size,\n",
" collate_fn=data_collator,\n",
")\n",
"eval_dataloader = DataLoader(\n",
" eval_dataset, batch_size=batch_size, collate_fn=default_data_collator\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qk987sI3k-95"
},
"outputs": [],
"source": [
"from torch.optim import AdamW\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=5e-5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mgKQR4Aok-96"
},
"outputs": [],
"source": [
"from accelerate import Accelerator\n",
"\n",
"accelerator = Accelerator()\n",
"model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(\n",
" model, optimizer, train_dataloader, eval_dataloader\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "p7iQC9nZk-97"
},
"outputs": [],
"source": [
"from transformers import get_scheduler\n",
"\n",
"num_train_epochs = 3\n",
"num_update_steps_per_epoch = len(train_dataloader)\n",
"num_training_steps = num_train_epochs * num_update_steps_per_epoch\n",
"\n",
"lr_scheduler = get_scheduler(\n",
" \"linear\",\n",
" optimizer=optimizer,\n",
" num_warmup_steps=0,\n",
" num_training_steps=num_training_steps,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3b8pk5NRk-98"
},
"outputs": [],
"source": [
"from huggingface_hub import get_full_repo_name\n",
"\n",
"model_name = \"camembert-base-finetuned-allocine-accelerate\"\n",
"repo_name = get_full_repo_name(model_name)\n",
"repo_name"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0dPTV7r6k-9-"
},
"outputs": [],
"source": [
"from huggingface_hub import Repository\n",
"\n",
"output_dir = model_name\n",
"repo = Repository(output_dir, clone_from=repo_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mekFif4nk-9_"
},
"outputs": [],
"source": [
"from tqdm.auto import tqdm\n",
"import torch\n",
"import math\n",
"\n",
"progress_bar = tqdm(range(num_training_steps))\n",
"\n",
"for epoch in range(num_train_epochs):\n",
" # Entraînement\n",
" model.train()\n",
" for batch in train_dataloader:\n",
" outputs = model(**batch)\n",
" loss = outputs.loss\n",
" accelerator.backward(loss)\n",
"\n",
" optimizer.step()\n",
" lr_scheduler.step()\n",
" optimizer.zero_grad()\n",
" progress_bar.update(1)\n",
"\n",
" # Evaluation\n",
" model.eval()\n",
" losses = []\n",
" for step, batch in enumerate(eval_dataloader):\n",
" with torch.no_grad():\n",
" outputs = model(**batch)\n",
"\n",
" loss = outputs.loss\n",
" losses.append(accelerator.gather(loss.repeat(batch_size)))\n",
"\n",
" losses = torch.cat(losses)\n",
" losses = losses[: len(eval_dataset)]\n",
" try:\n",
" perplexity = math.exp(torch.mean(losses))\n",
" except OverflowError:\n",
" perplexity = float(\"inf\")\n",
"\n",
" print(f\">>> Epoch {epoch}: Perplexity: {perplexity}\")\n",
"\n",
" # Sauvegarder et télécharger\n",
" accelerator.wait_for_everyone()\n",
" unwrapped_model = accelerator.unwrap_model(model)\n",
" unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)\n",
" if accelerator.is_main_process:\n",
" tokenizer.save_pretrained(output_dir)\n",
" repo.push_to_hub(\n",
" commit_message=f\"Training in progress epoch {epoch}\", blocking=False\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qpGfXsN3k--B"
},
"outputs": [],
"source": [
"from transformers import pipeline\n",
"\n",
"mask_filler = pipeline(\n",
" \"fill-mask\", model=\"huggingface-course/camembert-base-finetuned-allocine\", tokenizer=\"huggingface-course/camembert-base-finetuned-allocine\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q0EDeIUJk--C"
},
"outputs": [],
"source": [
"preds = mask_filler(text)\n",
"\n",
"for pred in preds:\n",
" print(f\">>> {pred['sequence']}\")"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15 (main, Oct 11 2022, 22:27:25) \n[Clang 14.0.0 (clang-1400.0.29.102)]"
},
"vscode": {
"interpreter": {
"hash": "397704579725e15f5c7cb49fe5f0341eb7531c82d19f2c29d197e8b64ab5776b"
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}