notebooks/curate_history_data.ipynb (1,529 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "id": "9516d218-b894-43d6-ac36-6cb9babb694c", "metadata": {}, "source": [ "Goal of this notebook is to collect and curate some history like data for various languages\n", "```\n", "Source 1 -> Multilingual summarization dataset)[https://www.kaggle.com/datasets/thedevastator/mlsam-multilingual-summarization-dataset]\n", "\n", "Source 2 -> https://github.com/microsoft/msmarco/blob/master/Datasets.md\n", " Don't download this link (VERY HUGE) - msmarco-docs.tsv\n", " Have selected some 500k examples from this for exploration\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "id": "14f941c4-35b2-4c1b-aa86-be43df7a5d13", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import sys\n", "from tqdm import tqdm\n", "from urllib.parse import urlparse\n", "import pickle" ] }, { "cell_type": "code", "execution_count": 2, "id": "5a88b688-8cdb-42c5-bb00-cf5580f763ab", "metadata": {}, "outputs": [], "source": [ "# Add the project root directory to the Python path\n", "project_root = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", "sys.path.append(project_root)" ] }, { "cell_type": "code", "execution_count": 3, "id": "66069a57-3549-40fa-a3d3-1525a699b979", "metadata": {}, "outputs": [], "source": [ "from src.constants import EMBEDDING_MODELS_DICT\n", "from src.feature_extractor import FeatureExtractor" ] }, { "cell_type": "code", "execution_count": 4, "id": "c0513beb-b94f-4fc1-a7ab-ebe6bc9031e1", "metadata": {}, "outputs": [], "source": [ "MLSAM_DATA_PATH = \"/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "d91f2a9b-1e85-4ec3-bb46-49f0a3899b63", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 9471984\n", "-rw-rw-r--@ 1 cgopal staff 46828273 Dec 2 2023 de_test.csv\n", "-rw-rw-r-- 1 cgopal staff 846124338 Dec 2 2023 de_train.csv\n", "-rw-rw-r--@ 1 cgopal staff 47087676 Dec 2 2023 de_validation.csv\n", "-rw-rw-r-- 1 cgopal staff 71117937 Dec 2 2023 es_test.csv\n", "-rw-rw-r-- 1 cgopal staff 1211703726 Dec 2 2023 es_train.csv\n", "-rw-rw-r-- 1 cgopal staff 50529088 Dec 2 2023 es_validation.csv\n", "-rw-rw-r--@ 1 cgopal staff 69447109 Dec 2 2023 fr_test.csv\n", "-rw-rw-r-- 1 cgopal staff 1468931669 Dec 2 2023 fr_train.csv\n", "-rw-rw-r-- 1 cgopal staff 70196782 Dec 2 2023 fr_validation.csv\n", "-rw-rw-r-- 1 cgopal staff 9646755 Dec 2 2023 ru_test.csv\n", "-rw-rw-r--@ 1 cgopal staff 257038607 Dec 2 2023 ru_train.csv\n", "-rw-rw-r-- 1 cgopal staff 27686809 Dec 2 2023 tu_test.csv\n", "-rw-rw-r--@ 1 cgopal staff 9119261 Dec 2 2023 ru_validation.csv\n", "-rw-rw-r-- 1 cgopal staff 638760165 Dec 2 2023 tu_train.csv\n", "-rw-rw-r-- 1 cgopal staff 25404303 Dec 2 2023 tu_validation.csv\n" ] } ], "source": [ "!ls -ltr $MLSAM_DATA_PATH" ] }, { "cell_type": "code", "execution_count": 6, "id": "9711f6a7-84a0-4c3c-8dc6-d36c1c78714a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 7%|██████▏ | 1/15 [00:06<01:37, 6.95s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/tu_train.csv -> 249277\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 13%|████████████▍ | 2/15 [00:07<00:40, 3.10s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/es_validation.csv -> 10358\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 27%|████████████████████████▊ | 4/15 [00:14<00:31, 2.87s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/de_train.csv -> 220887\n", "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/ru_test.csv -> 757\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 33%|███████████████████████████████ | 5/15 [00:24<00:55, 5.53s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/es_train.csv -> 266367\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 40%|█████████████████████████████████████▏ | 6/15 [00:24<00:34, 3.88s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/fr_test.csv -> 15828\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 47%|███████████████████████████████████████████▍ | 7/15 [00:25<00:21, 2.70s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/tu_test.csv -> 12775\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 53%|█████████████████████████████████████████████████▌ | 8/15 [00:25<00:13, 1.96s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/de_test.csv -> 10701\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 60%|███████████████████████████████████████████████████████▊ | 9/15 [00:25<00:08, 1.43s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/tu_validation.csv -> 11565\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 67%|█████████████████████████████████████████████████████████████▎ | 10/15 [00:26<00:05, 1.19s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/es_test.csv -> 13920\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 73%|███████████████████████████████████████████████████████████████████▍ | 11/15 [00:27<00:04, 1.03s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/fr_validation.csv -> 16059\n", "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/ru_validation.csv -> 750\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 87%|███████████████████████████████████████████████████████████████████████████████▋ | 13/15 [00:27<00:01, 1.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/de_validation.csv -> 11394\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 93%|█████████████████████████████████████████████████████████████████████████████████████▊ | 14/15 [00:41<00:03, 3.90s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/fr_train.csv -> 392902\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:43<00:00, 2.93s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/Users/cgopal/Downloads/mlsam-multilingual-summarization-dataset/ru_train.csv -> 25556\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "mlsam_data_list = []\n", "for fname in tqdm(os.listdir(MLSAM_DATA_PATH)):\n", " file_path = os.path.join(MLSAM_DATA_PATH, fname)\n", " df = pd.read_csv(file_path)\n", " print(file_path, \"->\", len(df))\n", " df['lang'] = fname[:2]\n", " mlsam_data_list.append(df)\n", " " ] }, { "cell_type": "code", "execution_count": 7, "id": "5bc0d977-62f0-4738-96fb-1e833a0b0419", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(mlsam_data_list)" ] }, { "cell_type": "code", "execution_count": 8, "id": "6bf65db9-a2e4-4600-8cd5-68b436c982fa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1259096\n", "After removal of unknown topics\n", "985405\n" ] } ], "source": [ "mlsam_data_combined = pd.concat(mlsam_data_list, axis=0)\n", "print(len(mlsam_data_combined))\n", "mlsam_data_combined = mlsam_data_combined.loc[mlsam_data_combined['topic']!='unknown'].reset_index(drop=True)\n", "mlsam_data_combined = mlsam_data_combined.loc[~mlsam_data_combined['title'].isna()].reset_index(drop=True)\n", "mlsam_data_combined['description'] = mlsam_data_combined['summary'].fillna('').apply(lambda summary: summary[:300])\n", "print(\"After removal of unknown topics\")\n", "print(len(mlsam_data_combined))" ] }, { "cell_type": "code", "execution_count": 9, "id": "17a7c056-2315-496a-b2f3-41feed910ac3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lang\n", "fr 424715\n", "es 290645\n", "de 242982\n", "ru 27063\n", "Name: count, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mlsam_data_combined['lang'].value_counts()" ] }, { "cell_type": "code", "execution_count": 10, "id": "1a7e453c-fddb-4233-a707-b236c56b936a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>text</th>\n", " <th>summary</th>\n", " <th>topic</th>\n", " <th>url</th>\n", " <th>title</th>\n", " <th>date</th>\n", " <th>lang</th>\n", " <th>description</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Un niño de tres años ha fallecido esta noche e...</td>\n", " <td>El niño llegó al hospital sin respiración ni p...</td>\n", " <td>politica actualidad</td>\n", " <td>http://elpais.com/politica/2019/01/01/actualid...</td>\n", " <td>Muere un niño de tres años atragantado con una...</td>\n", " <td>01/01/2019</td>\n", " <td>es</td>\n", " <td>El niño llegó al hospital sin respiración ni p...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>El bikini de flores y tul de Cristina Pedroche...</td>\n", " <td>La presentadora cambió su arriesgado dos pieza...</td>\n", " <td>elpais gente</td>\n", " <td>http://elpais.com/elpais/2019/01/01/gente/1546...</td>\n", " <td>El segundo vestido de Pedroche que quedó eclip...</td>\n", " <td>01/01/2019</td>\n", " <td>es</td>\n", " <td>La presentadora cambió su arriesgado dos pieza...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Una joven británica de 25 años ha resultado he...</td>\n", " <td>La mujer se despeñó en el municipio de Algaida...</td>\n", " <td>politica actualidad</td>\n", " <td>http://elpais.com/politica/2019/01/01/actualid...</td>\n", " <td>Una joven británica resulta herida muy grave a...</td>\n", " <td>01/01/2019</td>\n", " <td>es</td>\n", " <td>La mujer se despeñó en el municipio de Algaida...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Juan Carlos Galindo Con este comentario de nue...</td>\n", " <td>El alemán Christian Thielemann ha sido el enca...</td>\n", " <td>cultura actualidad</td>\n", " <td>http://elpais.com/cultura/2019/01/01/actualida...</td>\n", " <td>Así te hemos contado el Concierto de Año Nuevo...</td>\n", " <td>01/01/2019</td>\n", " <td>es</td>\n", " <td>El alemán Christian Thielemann ha sido el enca...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Agentes de policía detienen al atacante en Man...</td>\n", " <td>Detenido un hombre tras acuchillar a tres pers...</td>\n", " <td>internacional actualidad</td>\n", " <td>http://elpais.com/internacional/2019/01/01/act...</td>\n", " <td>La policía investiga como un ataque terrorista...</td>\n", " <td>01/01/2019</td>\n", " <td>es</td>\n", " <td>Detenido un hombre tras acuchillar a tres pers...</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>985400</th>\n", " <td>— Георгий Евгеньевич, складывается впечатление...</td>\n", " <td>Они чувствуют себя чужими в мире взрослых и тя...</td>\n", " <td>social</td>\n", " <td>https://www.mk.ru/social/2013/03/19/828147-kak...</td>\n", " <td>Как не стать педофилом?</td>\n", " <td>20/03/2013</td>\n", " <td>ru</td>\n", " <td>Они чувствуют себя чужими в мире взрослых и тя...</td>\n", " </tr>\n", " <tr>\n", " <th>985401</th>\n", " <td>Открывая заседание, президент ассоциации Валер...</td>\n", " <td>К такому выводу пришли представители малых и с...</td>\n", " <td>mosobl</td>\n", " <td>https://www.mk.ru/mosobl/2013/03/19/828148-ray...</td>\n", " <td>Райсовет — на обочине</td>\n", " <td>20/03/2013</td>\n", " <td>ru</td>\n", " <td>К такому выводу пришли представители малых и с...</td>\n", " </tr>\n", " <tr>\n", " <th>985402</th>\n", " <td>На улице Марины Цветаевой и на соседней Спарта...</td>\n", " <td>Власти Подмосковья в лице врио губернатора Анд...</td>\n", " <td>mosobl</td>\n", " <td>https://www.mk.ru/mosobl/2013/03/19/828197-po-...</td>\n", " <td>По закону против джунглей</td>\n", " <td>20/03/2013</td>\n", " <td>ru</td>\n", " <td>Власти Подмосковья в лице врио губернатора Анд...</td>\n", " </tr>\n", " <tr>\n", " <th>985403</th>\n", " <td>Что не нравится жителям города? В последние го...</td>\n", " <td>В Клину строительство высотки в исторической ч...</td>\n", " <td>mosobl</td>\n", " <td>https://www.mk.ru/mosobl/2013/03/19/828203-mes...</td>\n", " <td>Местный батюшка получил место в торговых рядах</td>\n", " <td>20/03/2013</td>\n", " <td>ru</td>\n", " <td>В Клину строительство высотки в исторической ч...</td>\n", " </tr>\n", " <tr>\n", " <th>985404</th>\n", " <td>— Во всем цивилизованном мире пригороды вокруг...</td>\n", " <td>Раис Ахмадеев, первый заместитель главы админи...</td>\n", " <td>mosobl</td>\n", " <td>https://www.mk.ru/mosobl/2013/03/19/828136-sta...</td>\n", " <td>Станут ли Химки российским Манхэттеном?</td>\n", " <td>20/03/2013</td>\n", " <td>ru</td>\n", " <td>Раис Ахмадеев, первый заместитель главы админи...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>985405 rows × 8 columns</p>\n", "</div>" ], "text/plain": [ " text \\\n", "0 Un niño de tres años ha fallecido esta noche e... \n", "1 El bikini de flores y tul de Cristina Pedroche... \n", "2 Una joven británica de 25 años ha resultado he... \n", "3 Juan Carlos Galindo Con este comentario de nue... \n", "4 Agentes de policía detienen al atacante en Man... \n", "... ... \n", "985400 — Георгий Евгеньевич, складывается впечатление... \n", "985401 Открывая заседание, президент ассоциации Валер... \n", "985402 На улице Марины Цветаевой и на соседней Спарта... \n", "985403 Что не нравится жителям города? В последние го... \n", "985404 — Во всем цивилизованном мире пригороды вокруг... \n", "\n", " summary \\\n", "0 El niño llegó al hospital sin respiración ni p... \n", "1 La presentadora cambió su arriesgado dos pieza... \n", "2 La mujer se despeñó en el municipio de Algaida... \n", "3 El alemán Christian Thielemann ha sido el enca... \n", "4 Detenido un hombre tras acuchillar a tres pers... \n", "... ... \n", "985400 Они чувствуют себя чужими в мире взрослых и тя... \n", "985401 К такому выводу пришли представители малых и с... \n", "985402 Власти Подмосковья в лице врио губернатора Анд... \n", "985403 В Клину строительство высотки в исторической ч... \n", "985404 Раис Ахмадеев, первый заместитель главы админи... \n", "\n", " topic \\\n", "0 politica actualidad \n", "1 elpais gente \n", "2 politica actualidad \n", "3 cultura actualidad \n", "4 internacional actualidad \n", "... ... \n", "985400 social \n", "985401 mosobl \n", "985402 mosobl \n", "985403 mosobl \n", "985404 mosobl \n", "\n", " url \\\n", "0 http://elpais.com/politica/2019/01/01/actualid... \n", "1 http://elpais.com/elpais/2019/01/01/gente/1546... \n", "2 http://elpais.com/politica/2019/01/01/actualid... \n", "3 http://elpais.com/cultura/2019/01/01/actualida... \n", "4 http://elpais.com/internacional/2019/01/01/act... \n", "... ... \n", "985400 https://www.mk.ru/social/2013/03/19/828147-kak... \n", "985401 https://www.mk.ru/mosobl/2013/03/19/828148-ray... \n", "985402 https://www.mk.ru/mosobl/2013/03/19/828197-po-... \n", "985403 https://www.mk.ru/mosobl/2013/03/19/828203-mes... \n", "985404 https://www.mk.ru/mosobl/2013/03/19/828136-sta... \n", "\n", " title date lang \\\n", "0 Muere un niño de tres años atragantado con una... 01/01/2019 es \n", "1 El segundo vestido de Pedroche que quedó eclip... 01/01/2019 es \n", "2 Una joven británica resulta herida muy grave a... 01/01/2019 es \n", "3 Así te hemos contado el Concierto de Año Nuevo... 01/01/2019 es \n", "4 La policía investiga como un ataque terrorista... 01/01/2019 es \n", "... ... ... ... \n", "985400 Как не стать педофилом? 20/03/2013 ru \n", "985401 Райсовет — на обочине 20/03/2013 ru \n", "985402 По закону против джунглей 20/03/2013 ru \n", "985403 Местный батюшка получил место в торговых рядах 20/03/2013 ru \n", "985404 Станут ли Химки российским Манхэттеном? 20/03/2013 ru \n", "\n", " description \n", "0 El niño llegó al hospital sin respiración ni p... \n", "1 La presentadora cambió su arriesgado dos pieza... \n", "2 La mujer se despeñó en el municipio de Algaida... \n", "3 El alemán Christian Thielemann ha sido el enca... \n", "4 Detenido un hombre tras acuchillar a tres pers... \n", "... ... \n", "985400 Они чувствуют себя чужими в мире взрослых и тя... \n", "985401 К такому выводу пришли представители малых и с... \n", "985402 Власти Подмосковья в лице врио губернатора Анд... \n", "985403 В Клину строительство высотки в исторической ч... \n", "985404 Раис Ахмадеев, первый заместитель главы админи... \n", "\n", "[985405 rows x 8 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mlsam_data_combined" ] }, { "cell_type": "code", "execution_count": 11, "id": "b7ac0687-0f3b-4d9a-90ca-16e70fb97cf2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 985405.000000\n", "mean 150.525854\n", "std 57.657746\n", "min 28.000000\n", "25% 109.000000\n", "50% 143.000000\n", "60% 157.000000\n", "70% 172.000000\n", "80% 192.000000\n", "90% 224.000000\n", "95% 256.000000\n", "98% 299.000000\n", "99% 329.000000\n", "max 1522.000000\n", "Name: summary, dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mlsam_data_combined['summary'].apply(len).describe(percentiles=[.25, .5, .6,.7, .8, .9, .95, .98, .99])" ] }, { "cell_type": "code", "execution_count": 12, "id": "dc29adbc-aec1-4422-acc7-a270555d5a6e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "topic\n", "sport 74601\n", "politik 74492\n", "economie 45957\n", "wirtschaft 40141\n", "idees 27022\n", " ... \n", "reseaux-sociaux 1\n", "israel-palestine 1\n", "remettre-le-confort-au-centre-de-l-innovation 1\n", "petrole 1\n", "voyage-togo 1\n", "Name: count, Length: 862, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mlsam_data_combined['topic'].value_counts()" ] }, { "cell_type": "code", "execution_count": 13, "id": "6bfb5fd8-7f7d-47e7-9498-5cf91f322a98", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lang topic \n", "de politik 74492\n", " sport 53814\n", " wirtschaft 40141\n", " panorama 22922\n", " digital 9811\n", " geld 8359\n", " reise 7510\n", " auto 6742\n", " karriere 6613\n", " muenchen 5802\n", " stil 3118\n", " bildung 2754\n", " service 819\n", " reisefuehrer 49\n", " thema 25\n", " app 6\n", " 1.2527726?utm_source=FlexiTM 1\n", " 1.2699904?SRCmuc=FlexiTM 1\n", " kultur 1\n", " mahjong 1\n", " tiananmen 1\n", "es elpais opinion 25897\n", " cultura actualidad 23009\n", " ccaa catalunya 21663\n", " politica actualidad 21579\n", " economia actualidad 20660\n", " internacional actualidad 18410\n", " deportes actualidad 15336\n", " ccaa madrid 14346\n", " ccaa valencia 8341\n", " cultura television 6999\n", " sociedad actualidad 6184\n", " elpais gente 5616\n", " elviajero actualidad 5196\n", " ccaa paisvasco 4590\n", " ccaa galicia 4562\n", " elpais planeta_futuro 3869\n", " elpais estilo 3816\n", " ccaa andalucia 3691\n", " diario deportes 3306\n", " elpais eps 3255\n", " elpais inenglish 3203\n", " cultura babelia 2952\n", " elpais album 2799\n", " diario internacional 2627\n", " diario espana 2619\n", " elpais videos 2544\n", " diario andalucia 2339\n", " elpais ciencia 2316\n", " diario paisvasco 2292\n", " diario madrid 2291\n", " tecnologia actualidad 2263\n", " elpais actualidad 2250\n", " diario galicia 2167\n", " diario cvalenciana 2070\n", " internacional america 2013\n", " diario catalunya 1973\n", " internacional mexico 1969\n", " elpais icon 1769\n", " diario sociedad 1585\n", "Name: count, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mlsam_data_combined.groupby('lang')['topic'].value_counts()[:60]" ] }, { "cell_type": "code", "execution_count": 14, "id": "013e4502-86e1-4204-9354-a63980393f37", "metadata": {}, "outputs": [], "source": [ "EN_MSMARCO_DATA_PATH = \"/Users/cgopal/Downloads/web_data/msmarco_english.csv\"" ] }, { "cell_type": "code", "execution_count": 15, "id": "3209ca59-269f-439a-9f0a-4ac88515654a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "495757\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>docid</th>\n", " <th>url</th>\n", " <th>title</th>\n", " <th>body</th>\n", " <th>lang</th>\n", " <th>topic</th>\n", " <th>description</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>D1555982</td>\n", " <td>https://answers.yahoo.com/question/index?qid=2...</td>\n", " <td>The hot glowing surfaces of stars emit energy ...</td>\n", " <td>Science &amp; Mathematics Physics The hot glowing ...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Science &amp; Mathematics Physics The hot glowing ...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>D301595</td>\n", " <td>http://childparenting.about.com/od/physicalemo...</td>\n", " <td>Developmental Milestones and Your 8-Year-Old C...</td>\n", " <td>School-Age Kids Growth &amp; Development Developme...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>School-Age Kids Growth &amp; Development Developme...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>D1359209</td>\n", " <td>http://visihow.com/Check_for_Lice_Nits</td>\n", " <td>Check for Lice Nits</td>\n", " <td>Check for Lice Nits Edited by Mian Sheilette O...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Check for Lice Nits Edited by Mian Sheilette O...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D2147834</td>\n", " <td>http://www.nytimes.com/2010/01/05/business/glo...</td>\n", " <td>Dubai Opens a Tower to Beat All</td>\n", " <td>Global Business Dubai Opens a Tower to Beat Al...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Global Business Dubai Opens a Tower to Beat Al...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>D1568809</td>\n", " <td>http://www.realtor.com/realestateandhomes-sear...</td>\n", " <td>Coulterville, CA Real Estate &amp; Homes for Sale</td>\n", " <td>Coulterville, CA Real Estate &amp; Homes for Sale4...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Coulterville, CA Real Estate &amp; Homes for Sale4...</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>495752</th>\n", " <td>D481294</td>\n", " <td>http://biology.about.com/od/organsystems/ss/ly...</td>\n", " <td>Lymphatic System Components</td>\n", " <td>Science, Tech, Math ›Science Lymphatic System ...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Science, Tech, Math ›Science Lymphatic System ...</td>\n", " </tr>\n", " <tr>\n", " <th>495753</th>\n", " <td>D2920233</td>\n", " <td>http://www.freshplaza.com/article/154127/Pomeg...</td>\n", " <td>Pomegranates: increasing production, consumpti...</td>\n", " <td>Pomegranates: increasing production, consumpti...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Pomegranates: increasing production, consumpti...</td>\n", " </tr>\n", " <tr>\n", " <th>495754</th>\n", " <td>D1616157</td>\n", " <td>http://www.airpassengerrights.eu/en/denied-boa...</td>\n", " <td>Denied boarding</td>\n", " <td>Denied boarding3 1 1 1 1 1 1 1 1 1 1 Rating 3....</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Denied boarding3 1 1 1 1 1 1 1 1 1 1 Rating 3....</td>\n", " </tr>\n", " <tr>\n", " <th>495755</th>\n", " <td>D1363414</td>\n", " <td>http://www.augustaga.gov/657/Solid-Waste</td>\n", " <td>Environmental Services</td>\n", " <td>Home Departments A - E Environmental Services ...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Home Departments A - E Environmental Services ...</td>\n", " </tr>\n", " <tr>\n", " <th>495756</th>\n", " <td>D2273143</td>\n", " <td>https://www.allaboutcircuits.com/textbook/refe...</td>\n", " <td>Solving Simultaneous Equations</td>\n", " <td>Solving Simultaneous Equations Chapter 4 - Alg...</td>\n", " <td>en</td>\n", " <td></td>\n", " <td>Solving Simultaneous Equations Chapter 4 - Alg...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>495757 rows × 7 columns</p>\n", "</div>" ], "text/plain": [ " docid url \\\n", "0 D1555982 https://answers.yahoo.com/question/index?qid=2... \n", "1 D301595 http://childparenting.about.com/od/physicalemo... \n", "2 D1359209 http://visihow.com/Check_for_Lice_Nits \n", "3 D2147834 http://www.nytimes.com/2010/01/05/business/glo... \n", "4 D1568809 http://www.realtor.com/realestateandhomes-sear... \n", "... ... ... \n", "495752 D481294 http://biology.about.com/od/organsystems/ss/ly... \n", "495753 D2920233 http://www.freshplaza.com/article/154127/Pomeg... \n", "495754 D1616157 http://www.airpassengerrights.eu/en/denied-boa... \n", "495755 D1363414 http://www.augustaga.gov/657/Solid-Waste \n", "495756 D2273143 https://www.allaboutcircuits.com/textbook/refe... \n", "\n", " title \\\n", "0 The hot glowing surfaces of stars emit energy ... \n", "1 Developmental Milestones and Your 8-Year-Old C... \n", "2 Check for Lice Nits \n", "3 Dubai Opens a Tower to Beat All \n", "4 Coulterville, CA Real Estate & Homes for Sale \n", "... ... \n", "495752 Lymphatic System Components \n", "495753 Pomegranates: increasing production, consumpti... \n", "495754 Denied boarding \n", "495755 Environmental Services \n", "495756 Solving Simultaneous Equations \n", "\n", " body lang topic \\\n", "0 Science & Mathematics Physics The hot glowing ... en \n", "1 School-Age Kids Growth & Development Developme... en \n", "2 Check for Lice Nits Edited by Mian Sheilette O... en \n", "3 Global Business Dubai Opens a Tower to Beat Al... en \n", "4 Coulterville, CA Real Estate & Homes for Sale4... en \n", "... ... ... ... \n", "495752 Science, Tech, Math ›Science Lymphatic System ... en \n", "495753 Pomegranates: increasing production, consumpti... en \n", "495754 Denied boarding3 1 1 1 1 1 1 1 1 1 1 Rating 3.... en \n", "495755 Home Departments A - E Environmental Services ... en \n", "495756 Solving Simultaneous Equations Chapter 4 - Alg... en \n", "\n", " description \n", "0 Science & Mathematics Physics The hot glowing ... \n", "1 School-Age Kids Growth & Development Developme... \n", "2 Check for Lice Nits Edited by Mian Sheilette O... \n", "3 Global Business Dubai Opens a Tower to Beat Al... \n", "4 Coulterville, CA Real Estate & Homes for Sale4... \n", "... ... \n", "495752 Science, Tech, Math ›Science Lymphatic System ... \n", "495753 Pomegranates: increasing production, consumpti... \n", "495754 Denied boarding3 1 1 1 1 1 1 1 1 1 1 Rating 3.... \n", "495755 Home Departments A - E Environmental Services ... \n", "495756 Solving Simultaneous Equations Chapter 4 - Alg... \n", "\n", "[495757 rows x 7 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "english_data = pd.read_csv(EN_MSMARCO_DATA_PATH, header=None)\n", "english_data.columns = ['docid', 'url', 'title', 'body']\n", "english_data['lang'] = 'en'\n", "english_data['topic'] = ''\n", "english_data = english_data.loc[~english_data['title'].isna()].reset_index(drop=True)\n", "english_data['description'] = english_data['body'].fillna('').apply(lambda body: body[:300])\n", "print(len(english_data))\n", "english_data" ] }, { "cell_type": "code", "execution_count": 16, "id": "91dedb58-b11a-4949-8d0d-8c012d7d7242", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1423693\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>title</th>\n", " <th>description</th>\n", " <th>topic</th>\n", " <th>lang</th>\n", " <th>domain</th>\n", " <th>combined_text</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>http://elpais.com/politica/2019/01/01/actualid...</td>\n", " <td>Muere un niño de tres años atragantado con una...</td>\n", " <td>El niño llegó al hospital sin respiración ni p...</td>\n", " <td>politica actualidad</td>\n", " <td>es</td>\n", " <td>elpais.com</td>\n", " <td>Muere un niño de tres años atragantado con una...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>http://elpais.com/elpais/2019/01/01/gente/1546...</td>\n", " <td>El segundo vestido de Pedroche que quedó eclip...</td>\n", " <td>La presentadora cambió su arriesgado dos pieza...</td>\n", " <td>elpais gente</td>\n", " <td>es</td>\n", " <td>elpais.com</td>\n", " <td>El segundo vestido de Pedroche que quedó eclip...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>http://elpais.com/politica/2019/01/01/actualid...</td>\n", " <td>Una joven británica resulta herida muy grave a...</td>\n", " <td>La mujer se despeñó en el municipio de Algaida...</td>\n", " <td>politica actualidad</td>\n", " <td>es</td>\n", " <td>elpais.com</td>\n", " <td>Una joven británica resulta herida muy grave a...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>http://elpais.com/cultura/2019/01/01/actualida...</td>\n", " <td>Así te hemos contado el Concierto de Año Nuevo...</td>\n", " <td>El alemán Christian Thielemann ha sido el enca...</td>\n", " <td>cultura actualidad</td>\n", " <td>es</td>\n", " <td>elpais.com</td>\n", " <td>Así te hemos contado el Concierto de Año Nuevo...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>http://elpais.com/internacional/2019/01/01/act...</td>\n", " <td>La policía investiga como un ataque terrorista...</td>\n", " <td>Detenido un hombre tras acuchillar a tres pers...</td>\n", " <td>internacional actualidad</td>\n", " <td>es</td>\n", " <td>elpais.com</td>\n", " <td>La policía investiga como un ataque terrorista...</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>1423688</th>\n", " <td>http://biology.about.com/od/organsystems/ss/ly...</td>\n", " <td>Lymphatic System Components</td>\n", " <td>Science, Tech, Math ›Science Lymphatic System ...</td>\n", " <td></td>\n", " <td>en</td>\n", " <td>biology.about.com</td>\n", " <td>Lymphatic System Components Science, Tech, Mat...</td>\n", " </tr>\n", " <tr>\n", " <th>1423689</th>\n", " <td>http://www.freshplaza.com/article/154127/Pomeg...</td>\n", " <td>Pomegranates: increasing production, consumpti...</td>\n", " <td>Pomegranates: increasing production, consumpti...</td>\n", " <td></td>\n", " <td>en</td>\n", " <td>www.freshplaza.com</td>\n", " <td>Pomegranates: increasing production, consumpti...</td>\n", " </tr>\n", " <tr>\n", " <th>1423690</th>\n", " <td>http://www.airpassengerrights.eu/en/denied-boa...</td>\n", " <td>Denied boarding</td>\n", " <td>Denied boarding3 1 1 1 1 1 1 1 1 1 1 Rating 3....</td>\n", " <td></td>\n", " <td>en</td>\n", " <td>www.airpassengerrights.eu</td>\n", " <td>Denied boarding Denied boarding3 1 1 1 1 1 1 1...</td>\n", " </tr>\n", " <tr>\n", " <th>1423691</th>\n", " <td>http://www.augustaga.gov/657/Solid-Waste</td>\n", " <td>Environmental Services</td>\n", " <td>Home Departments A - E Environmental Services ...</td>\n", " <td></td>\n", " <td>en</td>\n", " <td>www.augustaga.gov</td>\n", " <td>Environmental Services Home Departments A - E ...</td>\n", " </tr>\n", " <tr>\n", " <th>1423692</th>\n", " <td>https://www.allaboutcircuits.com/textbook/refe...</td>\n", " <td>Solving Simultaneous Equations</td>\n", " <td>Solving Simultaneous Equations Chapter 4 - Alg...</td>\n", " <td></td>\n", " <td>en</td>\n", " <td>www.allaboutcircuits.com</td>\n", " <td>Solving Simultaneous Equations Solving Simulta...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>1423693 rows × 7 columns</p>\n", "</div>" ], "text/plain": [ " url \\\n", "0 http://elpais.com/politica/2019/01/01/actualid... \n", "1 http://elpais.com/elpais/2019/01/01/gente/1546... \n", "2 http://elpais.com/politica/2019/01/01/actualid... \n", "3 http://elpais.com/cultura/2019/01/01/actualida... \n", "4 http://elpais.com/internacional/2019/01/01/act... \n", "... ... \n", "1423688 http://biology.about.com/od/organsystems/ss/ly... \n", "1423689 http://www.freshplaza.com/article/154127/Pomeg... \n", "1423690 http://www.airpassengerrights.eu/en/denied-boa... \n", "1423691 http://www.augustaga.gov/657/Solid-Waste \n", "1423692 https://www.allaboutcircuits.com/textbook/refe... \n", "\n", " title \\\n", "0 Muere un niño de tres años atragantado con una... \n", "1 El segundo vestido de Pedroche que quedó eclip... \n", "2 Una joven británica resulta herida muy grave a... \n", "3 Así te hemos contado el Concierto de Año Nuevo... \n", "4 La policía investiga como un ataque terrorista... \n", "... ... \n", "1423688 Lymphatic System Components \n", "1423689 Pomegranates: increasing production, consumpti... \n", "1423690 Denied boarding \n", "1423691 Environmental Services \n", "1423692 Solving Simultaneous Equations \n", "\n", " description \\\n", "0 El niño llegó al hospital sin respiración ni p... \n", "1 La presentadora cambió su arriesgado dos pieza... \n", "2 La mujer se despeñó en el municipio de Algaida... \n", "3 El alemán Christian Thielemann ha sido el enca... \n", "4 Detenido un hombre tras acuchillar a tres pers... \n", "... ... \n", "1423688 Science, Tech, Math ›Science Lymphatic System ... \n", "1423689 Pomegranates: increasing production, consumpti... \n", "1423690 Denied boarding3 1 1 1 1 1 1 1 1 1 1 Rating 3.... \n", "1423691 Home Departments A - E Environmental Services ... \n", "1423692 Solving Simultaneous Equations Chapter 4 - Alg... \n", "\n", " topic lang domain \\\n", "0 politica actualidad es elpais.com \n", "1 elpais gente es elpais.com \n", "2 politica actualidad es elpais.com \n", "3 cultura actualidad es elpais.com \n", "4 internacional actualidad es elpais.com \n", "... ... ... ... \n", "1423688 en biology.about.com \n", "1423689 en www.freshplaza.com \n", "1423690 en www.airpassengerrights.eu \n", "1423691 en www.augustaga.gov \n", "1423692 en www.allaboutcircuits.com \n", "\n", " combined_text \n", "0 Muere un niño de tres años atragantado con una... \n", "1 El segundo vestido de Pedroche que quedó eclip... \n", "2 Una joven británica resulta herida muy grave a... \n", "3 Así te hemos contado el Concierto de Año Nuevo... \n", "4 La policía investiga como un ataque terrorista... \n", "... ... \n", "1423688 Lymphatic System Components Science, Tech, Mat... \n", "1423689 Pomegranates: increasing production, consumpti... \n", "1423690 Denied boarding Denied boarding3 1 1 1 1 1 1 1... \n", "1423691 Environmental Services Home Departments A - E ... \n", "1423692 Solving Simultaneous Equations Solving Simulta... \n", "\n", "[1423693 rows x 7 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combined_data = pd.concat([\n", " mlsam_data_combined[['url', 'title', 'description', 'topic', 'lang']],\n", " english_data[['url', 'title', 'description', 'topic', 'lang']]\n", "], axis=0).reset_index(drop=True)\n", "\n", "combined_data = combined_data.loc[combined_data['title'].apply(lambda title: len(title) > 5 and len(title) < 200)].reset_index(drop=True)\n", "\n", "combined_data['domain'] = combined_data['url'].apply(lambda x: urlparse(x).netloc.split(':')[0])\n", "combined_data['combined_text'] = combined_data['title'] + \" \" + combined_data['description'].fillna(\"\")\n", "print(len(combined_data))\n", "combined_data" ] }, { "cell_type": "code", "execution_count": 17, "id": "90afa11a-2674-4413-b948-c9e2409fe9ed", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "lang\n", "en 35000\n", "fr 5000\n", "es 5000\n", "de 5000\n", "ru 5000\n", "Name: count, dtype: int64\n" ] } ], "source": [ "# sample_data = combined_data.sample(200000)\n", "# print(len(sample_data))\n", "\n", "\n", "# sampling logic\n", "\n", "sample_size = 50000\n", "language_distribution = {\n", " 'en': 0.7 * sample_size, # 70% English\n", " 'fr': 0.1 * sample_size, # 10% French\n", " 'es': 0.1 * sample_size, # 10% Spanish\n", " 'de': 0.1 * sample_size, # 10% German\n", " 'ru': 0.1 * sample_size # 10% Russian\n", "}\n", "\n", "# Sample data for each language\n", "sampled_dfs = []\n", "for lang, size in language_distribution.items():\n", " sampled_dfs.append(combined_data[combined_data['lang'] == lang].sample(n=int(size), random_state=42))\n", "\n", "# Combine the sampled data\n", "sampled_df = pd.concat(sampled_dfs, axis=0).reset_index(drop=True)\n", "\n", "# Display the result\n", "print(sampled_df['lang'].value_counts())" ] }, { "cell_type": "code", "execution_count": 18, "id": "b9a171e6-a548-4501-a1c1-57902bd74f13", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lang\n", "en 438426\n", "fr 424630\n", "es 290610\n", "de 242978\n", "ru 27049\n", "Name: count, dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combined_data['lang'].value_counts()" ] }, { "cell_type": "code", "execution_count": 19, "id": "1dd631f2-0ac5-416d-aaf4-80e995a478af", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lang\n", "en 0.079831\n", "fr 0.011775\n", "es 0.017205\n", "de 0.020578\n", "ru 0.184850\n", "Name: count, dtype: float64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sampled_df['lang'].value_counts() / combined_data['lang'].value_counts()" ] }, { "cell_type": "code", "execution_count": 20, "id": "6147c3da-29b3-45d2-bd59-93567a5eeed9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "selected model is Xenova/all-MiniLM-L6-v2\n", "selected model is Xenova/paraphrase-multilingual-MiniLM-L12-v2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/5 [00:00<?, ?it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "preparing for lang = en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 20%|██████████████████▌ | 1/5 [14:06<56:26, 846.66s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "preparing for lang = fr\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 40%|█████████████████████████████████████▏ | 2/5 [15:47<20:24, 408.15s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "preparing for lang = es\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 60%|███████████████████████████████████████████████████████▊ | 3/5 [17:07<08:36, 258.17s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "preparing for lang = de\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 80%|██████████████████████████████████████████████████████████████████████████▍ | 4/5 [18:41<03:13, 193.37s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "preparing for lang = ru\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "00%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [19:59<00:00, 239.89s/it]" ] } ], "source": [ "\n", "sample_data_by_lang = {}\n", "sample_embeddings_by_lang = {}\n", "fe_en = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=\"Xenova/all-MiniLM-L6-v2\")\n", "fe_multi = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=\"Xenova/paraphrase-multilingual-MiniLM-L12-v2\")\n", "for lang in tqdm(sampled_df['lang'].unique()):\n", " print(f\"preparing for lang = {lang}\")\n", " sample_data_by_lang[lang] = sampled_df[sampled_df['lang'] == lang].reset_index(drop=True)\n", " if lang == \"en\":\n", " sample_embeddings_by_lang[lang] = fe_en.get_embeddings(sample_data_by_lang[lang]['combined_text'].values.tolist())\n", " else:\n", " sample_embeddings_by_lang[lang] = fe_multi.get_embeddings(sample_data_by_lang[lang]['combined_text'].values.tolist())" ] }, { "cell_type": "code", "execution_count": 21, "id": "5dc2e220-d3b3-4bbb-a57b-6b859b638e30", "metadata": {}, "outputs": [], "source": [ "with open(\"../data/sample_data_by_lang.pkl\", \"wb\") as f:\n", " pickle.dump(sample_data_by_lang, f)\n", "\n", "with open(\"../data/sample_embeddings_by_lang.pkl\", \"wb\") as f:\n", " pickle.dump(sample_embeddings_by_lang, f)" ] }, { "cell_type": "code", "execution_count": null, "id": "88209318-63ee-47c7-94a8-bd7162ea21f9", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 5 }