notebooks/user_profiles_generation.ipynb (1,184 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "19cc239a-7ab1-46bd-88d9-706a6e866565",
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"import sqlite3\n",
"import pandas as pd\n",
"import os\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ccd0957e-49a8-4d43-bfe3-6cd5b5d9aac9",
"metadata": {},
"outputs": [],
"source": [
"# Sample browsing data; the cells below index it by language code ('en').\n",
"# NOTE(review): pickle.load can execute arbitrary code -- confirm this file comes from a trusted source.\n",
"with open(\"../data/sample_data_by_lang_updated.pkl\", mode=\"rb\") as f:\n",
"    sample_data_by_lang_updated = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d0392fc9-886a-459b-a22d-5c68916aab34",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sample_data_by_lang_updated)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6a149111-cd7f-4c79-9b10-281073b03384",
"metadata": {},
"outputs": [],
"source": [
"# Write the 'en' sample to CSV without the positional index column.\n",
"sample_data_by_lang_updated['en'].to_csv(\n",
"    \"../data/en_search_web_history_data.csv\",\n",
"    index=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b2b10a9e-7010-47a6-a8d5-4f07aaac51f3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>title</th>\n",
" <th>description</th>\n",
" <th>topic</th>\n",
" <th>lang</th>\n",
" <th>domain</th>\n",
" <th>combined_text</th>\n",
" <th>themes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>http://wordstormcasserole.com/boombycindyjoseph/</td>\n",
" <td>Review: BOOM! By Cindy Joseph Makeup</td>\n",
" <td>Review: BOOM! By Cindy Joseph Makeup November ...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>wordstormcasserole.com</td>\n",
" <td>Review: BOOM! By Cindy Joseph Makeup Review: B...</td>\n",
" <td>[Community and Society, Lifestyle, Arts & Ente...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>http://rhodesokla.com/2013/09/major-oklahoma-t...</td>\n",
" <td>Major Oklahoma Tort Reform Laws Restored</td>\n",
" <td>Major Oklahoma Tort Reform Laws Restored Septe...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>rhodesokla.com</td>\n",
" <td>Major Oklahoma Tort Reform Laws Restored Major...</td>\n",
" <td>[Law and Government]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>https://www.theweathernetwork.com/ca/weather/o...</td>\n",
" <td>Vaughan, ON</td>\n",
" <td>Vaughan, ON Would you like to add this locatio...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.theweathernetwork.com</td>\n",
" <td>Vaughan, ON Vaughan, ON Would you like to add ...</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>http://www.city-data.com/city/Vernon-Texas.html</td>\n",
" <td>Vernon, Texas</td>\n",
" <td>Follow city-data.com founder on our Forum or@L...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.city-data.com</td>\n",
" <td>Vernon, Texas Follow city-data.com founder on ...</td>\n",
" <td>[Community and Society, Health]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>http://forums.moneysavingexpert.com/showthread...</td>\n",
" <td>Welcometo theMSE Forums</td>\n",
" <td>Welcometo the MSE Forums We're home to a fanta...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>forums.moneysavingexpert.com</td>\n",
" <td>Welcometo theMSE Forums Welcometo the MSE Foru...</td>\n",
" <td>[Community and Society, Finance, Law and Gover...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"0 http://wordstormcasserole.com/boombycindyjoseph/ \n",
"1 http://rhodesokla.com/2013/09/major-oklahoma-t... \n",
"2 https://www.theweathernetwork.com/ca/weather/o... \n",
"3 http://www.city-data.com/city/Vernon-Texas.html \n",
"4 http://forums.moneysavingexpert.com/showthread... \n",
"\n",
" title \\\n",
"0 Review: BOOM! By Cindy Joseph Makeup \n",
"1 Major Oklahoma Tort Reform Laws Restored \n",
"2 Vaughan, ON \n",
"3 Vernon, Texas \n",
"4 Welcometo theMSE Forums \n",
"\n",
" description topic lang \\\n",
"0 Review: BOOM! By Cindy Joseph Makeup November ... en \n",
"1 Major Oklahoma Tort Reform Laws Restored Septe... en \n",
"2 Vaughan, ON Would you like to add this locatio... en \n",
"3 Follow city-data.com founder on our Forum or@L... en \n",
"4 Welcometo the MSE Forums We're home to a fanta... en \n",
"\n",
" domain \\\n",
"0 wordstormcasserole.com \n",
"1 rhodesokla.com \n",
"2 www.theweathernetwork.com \n",
"3 www.city-data.com \n",
"4 forums.moneysavingexpert.com \n",
"\n",
" combined_text \\\n",
"0 Review: BOOM! By Cindy Joseph Makeup Review: B... \n",
"1 Major Oklahoma Tort Reform Laws Restored Major... \n",
"2 Vaughan, ON Vaughan, ON Would you like to add ... \n",
"3 Vernon, Texas Follow city-data.com founder on ... \n",
"4 Welcometo theMSE Forums Welcometo the MSE Foru... \n",
"\n",
" themes \n",
"0 [Community and Society, Lifestyle, Arts & Ente... \n",
"1 [Law and Government] \n",
"2 [] \n",
"3 [Community and Society, Health] \n",
"4 [Community and Society, Finance, Law and Gover... "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_data_by_lang_updated['en'].head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2c8a25dc-8761-44b6-80ab-9533f4a8b1a6",
"metadata": {},
"outputs": [],
"source": [
"# Generated user profiles: an indexable collection of dicts (see the key listing a few cells below).\n",
"# NOTE(review): pickle.load can execute arbitrary code -- confirm this file comes from a trusted source.\n",
"with open(\"../data/profiles_generated.pkl\", mode=\"rb\") as f:\n",
"    profiles_generated = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "1ebd7f55-5e72-4260-b92d-bcba7dd8897f",
"metadata": {},
"outputs": [],
"source": [
"# Search queries per profile; looked up by profile name further down in this notebook.\n",
"# NOTE(review): pickle.load can execute arbitrary code -- confirm this file comes from a trusted source.\n",
"with open(\"../data/user_profile_queries.pkl\", mode=\"rb\") as f:\n",
"    user_profile_queries = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "41089a0b-6f64-4fed-a500-b6ab7aae2b50",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['profile_name', 'categories', 'langs', 'desc', 'df'])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"profiles_generated[2].keys()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "2b7afa27-a3bd-4c42-a8de-b92c00639312",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Creative Hobbyist'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"profiles_generated[2]['profile_name']"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "32290eb0-6792-4b46-abcc-628cd9beab57",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Hobbies and Leisure', 'Arts & Entertainment', 'Lifestyle', 'Home and Garden']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"profiles_generated[2]['categories']"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "10115f32-2e01-4db1-b346-280c510497b6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>title</th>\n",
" <th>description</th>\n",
" <th>topic</th>\n",
" <th>lang</th>\n",
" <th>domain</th>\n",
" <th>combined_text</th>\n",
" <th>themes</th>\n",
" <th>frecency</th>\n",
" <th>url_hash</th>\n",
" <th>last_visit_date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>https://www.lifewire.com/extreme-winter-photog...</td>\n",
" <td>Extreme Winter Photography</td>\n",
" <td>How To ›Digital Cameras Extreme Winter Photogr...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.lifewire.com</td>\n",
" <td>Extreme Winter Photography How To ›Digital Cam...</td>\n",
" <td>[Computers Electronics and Technology, Arts & ...</td>\n",
" <td>962</td>\n",
" <td>97419828244965</td>\n",
" <td>1738272722935952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>http://www.just-health.net/How-Much-Chia-Seed-...</td>\n",
" <td>How Many Chia Seeds Can You Have per Day?</td>\n",
" <td>How Many Chia Seeds Can You Have per Day? Chia...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.just-health.net</td>\n",
" <td>How Many Chia Seeds Can You Have per Day? How ...</td>\n",
" <td>[Food and Drink, Lifestyle, Health]</td>\n",
" <td>3643</td>\n",
" <td>57133353541317</td>\n",
" <td>1738272722937953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.black-mold-guide.com/stachybotrys-m...</td>\n",
" <td>Stachybotrys Mold</td>\n",
" <td>Stachybotrys Mold Stachybotrys mold can be ver...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.black-mold-guide.com</td>\n",
" <td>Stachybotrys Mold Stachybotrys Mold Stachybotr...</td>\n",
" <td>[Home and Garden, Health]</td>\n",
" <td>2035</td>\n",
" <td>37815498102075</td>\n",
" <td>1738272722939953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>http://purpleheart.forumotion.com/t512-the-dau...</td>\n",
" <td>The Daughter of Wolverine (a TMNT and X-Men fa...</td>\n",
" <td>thexblackxmaskedxninja Number of posts : 3893 ...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>purpleheart.forumotion.com</td>\n",
" <td>The Daughter of Wolverine (a TMNT and X-Men fa...</td>\n",
" <td>[Arts & Entertainment]</td>\n",
" <td>3933</td>\n",
" <td>14159477585427</td>\n",
" <td>1738272722941953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://www.last.fm/music/Baton+Rouge</td>\n",
" <td>Baton Rouge</td>\n",
" <td>Last.fm Search Live Music Events Features Join...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.last.fm</td>\n",
" <td>Baton Rouge Last.fm Search Live Music Events F...</td>\n",
" <td>[Arts & Entertainment]</td>\n",
" <td>3221</td>\n",
" <td>21651773862014</td>\n",
" <td>1738272722943953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6247</th>\n",
" <td>https://www.timeout.com/newyork/style-design/t...</td>\n",
" <td>Find the best garden store in NYC</td>\n",
" <td>Photograph: Courtesy Verni's Brightening up cr...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.timeout.com</td>\n",
" <td>Find the best garden store in NYC Photograph: ...</td>\n",
" <td>[Lifestyle, Business and Consumer Services, Ho...</td>\n",
" <td>4729</td>\n",
" <td>90212593791692</td>\n",
" <td>1738272735430826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6248</th>\n",
" <td>http://www.infobarrel.com/How_To_Get_Rid_of_Ho...</td>\n",
" <td>How To Get Rid of House Flies</td>\n",
" <td>Info Barrel > Home & Garden > Gardening & Yard...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.infobarrel.com</td>\n",
" <td>How To Get Rid of House Flies Info Barrel > Ho...</td>\n",
" <td>[Health, Home and Garden, Reference Materials]</td>\n",
" <td>3607</td>\n",
" <td>98111215616349</td>\n",
" <td>1738272735432826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6249</th>\n",
" <td>http://www.sbs.com.au/popasia/blog/2014/07/21/...</td>\n",
" <td>G-Dragon says he âwill get married soonâ on En...</td>\n",
" <td>Previous Next Show Grid Previous Next Hide Gri...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.sbs.com.au</td>\n",
" <td>G-Dragon says he âwill get married soonâ on En...</td>\n",
" <td>[Arts & Entertainment, News & Media Publishers]</td>\n",
" <td>164</td>\n",
" <td>40500234243399</td>\n",
" <td>1738272735434826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6250</th>\n",
" <td>http://www.definitions.net/definition/pond</td>\n",
" <td>Definitions &Translations</td>\n",
" <td>Princeton's Word Net (0.00 / 0 votes)Rate this...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>www.definitions.net</td>\n",
" <td>Definitions &Translations Princeton's Word Net...</td>\n",
" <td>[Hobbies and Leisure, Reference Materials]</td>\n",
" <td>2985</td>\n",
" <td>25425027875974</td>\n",
" <td>1738272735436827</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6251</th>\n",
" <td>http://thehelpfulartteacher.blogspot.com/2013/...</td>\n",
" <td>LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE</td>\n",
" <td>LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE...</td>\n",
" <td></td>\n",
" <td>en</td>\n",
" <td>thehelpfulartteacher.blogspot.com</td>\n",
" <td>LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE...</td>\n",
" <td>[Arts & Entertainment]</td>\n",
" <td>3921</td>\n",
" <td>53741128296830</td>\n",
" <td>1738272735438827</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6252 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"0 https://www.lifewire.com/extreme-winter-photog... \n",
"1 http://www.just-health.net/How-Much-Chia-Seed-... \n",
"2 http://www.black-mold-guide.com/stachybotrys-m... \n",
"3 http://purpleheart.forumotion.com/t512-the-dau... \n",
"4 https://www.last.fm/music/Baton+Rouge \n",
"... ... \n",
"6247 https://www.timeout.com/newyork/style-design/t... \n",
"6248 http://www.infobarrel.com/How_To_Get_Rid_of_Ho... \n",
"6249 http://www.sbs.com.au/popasia/blog/2014/07/21/... \n",
"6250 http://www.definitions.net/definition/pond \n",
"6251 http://thehelpfulartteacher.blogspot.com/2013/... \n",
"\n",
" title \\\n",
"0 Extreme Winter Photography \n",
"1 How Many Chia Seeds Can You Have per Day? \n",
"2 Stachybotrys Mold \n",
"3 The Daughter of Wolverine (a TMNT and X-Men fa... \n",
"4 Baton Rouge \n",
"... ... \n",
"6247 Find the best garden store in NYC \n",
"6248 How To Get Rid of House Flies \n",
"6249 G-Dragon says he âwill get married soonâ on En... \n",
"6250 Definitions &Translations \n",
"6251 LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE \n",
"\n",
" description topic lang \\\n",
"0 How To ›Digital Cameras Extreme Winter Photogr... en \n",
"1 How Many Chia Seeds Can You Have per Day? Chia... en \n",
"2 Stachybotrys Mold Stachybotrys mold can be ver... en \n",
"3 thexblackxmaskedxninja Number of posts : 3893 ... en \n",
"4 Last.fm Search Live Music Events Features Join... en \n",
"... ... ... ... \n",
"6247 Photograph: Courtesy Verni's Brightening up cr... en \n",
"6248 Info Barrel > Home & Garden > Gardening & Yard... en \n",
"6249 Previous Next Show Grid Previous Next Hide Gri... en \n",
"6250 Princeton's Word Net (0.00 / 0 votes)Rate this... en \n",
"6251 LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE... en \n",
"\n",
" domain \\\n",
"0 www.lifewire.com \n",
"1 www.just-health.net \n",
"2 www.black-mold-guide.com \n",
"3 purpleheart.forumotion.com \n",
"4 www.last.fm \n",
"... ... \n",
"6247 www.timeout.com \n",
"6248 www.infobarrel.com \n",
"6249 www.sbs.com.au \n",
"6250 www.definitions.net \n",
"6251 thehelpfulartteacher.blogspot.com \n",
"\n",
" combined_text \\\n",
"0 Extreme Winter Photography How To ›Digital Cam... \n",
"1 How Many Chia Seeds Can You Have per Day? How ... \n",
"2 Stachybotrys Mold Stachybotrys Mold Stachybotr... \n",
"3 The Daughter of Wolverine (a TMNT and X-Men fa... \n",
"4 Baton Rouge Last.fm Search Live Music Events F... \n",
"... ... \n",
"6247 Find the best garden store in NYC Photograph: ... \n",
"6248 How To Get Rid of House Flies Info Barrel > Ho... \n",
"6249 G-Dragon says he âwill get married soonâ on En... \n",
"6250 Definitions &Translations Princeton's Word Net... \n",
"6251 LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE... \n",
"\n",
" themes frecency \\\n",
"0 [Computers Electronics and Technology, Arts & ... 962 \n",
"1 [Food and Drink, Lifestyle, Health] 3643 \n",
"2 [Home and Garden, Health] 2035 \n",
"3 [Arts & Entertainment] 3933 \n",
"4 [Arts & Entertainment] 3221 \n",
"... ... ... \n",
"6247 [Lifestyle, Business and Consumer Services, Ho... 4729 \n",
"6248 [Health, Home and Garden, Reference Materials] 3607 \n",
"6249 [Arts & Entertainment, News & Media Publishers] 164 \n",
"6250 [Hobbies and Leisure, Reference Materials] 2985 \n",
"6251 [Arts & Entertainment] 3921 \n",
"\n",
" url_hash last_visit_date \n",
"0 97419828244965 1738272722935952 \n",
"1 57133353541317 1738272722937953 \n",
"2 37815498102075 1738272722939953 \n",
"3 14159477585427 1738272722941953 \n",
"4 21651773862014 1738272722943953 \n",
"... ... ... \n",
"6247 90212593791692 1738272735430826 \n",
"6248 98111215616349 1738272735432826 \n",
"6249 40500234243399 1738272735434826 \n",
"6250 25425027875974 1738272735436827 \n",
"6251 53741128296830 1738272735438827 \n",
"\n",
"[6252 rows x 11 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"profiles_generated[2]['df']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b4978f76-24d8-447a-b7d9-1506e817a47e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>title</th>\n",
" <th>description</th>\n",
" <th>frecency</th>\n",
" <th>url_hash</th>\n",
" <th>last_visit_date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>https://www.lifewire.com/extreme-winter-photog...</td>\n",
" <td>Extreme Winter Photography</td>\n",
" <td>How To ›Digital Cameras Extreme Winter Photogr...</td>\n",
" <td>962</td>\n",
" <td>97419828244965</td>\n",
" <td>1738272722935952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>http://www.just-health.net/How-Much-Chia-Seed-...</td>\n",
" <td>How Many Chia Seeds Can You Have per Day?</td>\n",
" <td>How Many Chia Seeds Can You Have per Day? Chia...</td>\n",
" <td>3643</td>\n",
" <td>57133353541317</td>\n",
" <td>1738272722937953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.black-mold-guide.com/stachybotrys-m...</td>\n",
" <td>Stachybotrys Mold</td>\n",
" <td>Stachybotrys Mold Stachybotrys mold can be ver...</td>\n",
" <td>2035</td>\n",
" <td>37815498102075</td>\n",
" <td>1738272722939953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>http://purpleheart.forumotion.com/t512-the-dau...</td>\n",
" <td>The Daughter of Wolverine (a TMNT and X-Men fa...</td>\n",
" <td>thexblackxmaskedxninja Number of posts : 3893 ...</td>\n",
" <td>3933</td>\n",
" <td>14159477585427</td>\n",
" <td>1738272722941953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://www.last.fm/music/Baton+Rouge</td>\n",
" <td>Baton Rouge</td>\n",
" <td>Last.fm Search Live Music Events Features Join...</td>\n",
" <td>3221</td>\n",
" <td>21651773862014</td>\n",
" <td>1738272722943953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6247</th>\n",
" <td>https://www.timeout.com/newyork/style-design/t...</td>\n",
" <td>Find the best garden store in NYC</td>\n",
" <td>Photograph: Courtesy Verni's Brightening up cr...</td>\n",
" <td>4729</td>\n",
" <td>90212593791692</td>\n",
" <td>1738272735430826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6248</th>\n",
" <td>http://www.infobarrel.com/How_To_Get_Rid_of_Ho...</td>\n",
" <td>How To Get Rid of House Flies</td>\n",
" <td>Info Barrel > Home & Garden > Gardening & Yard...</td>\n",
" <td>3607</td>\n",
" <td>98111215616349</td>\n",
" <td>1738272735432826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6249</th>\n",
" <td>http://www.sbs.com.au/popasia/blog/2014/07/21/...</td>\n",
" <td>G-Dragon says he âwill get married soonâ on En...</td>\n",
" <td>Previous Next Show Grid Previous Next Hide Gri...</td>\n",
" <td>164</td>\n",
" <td>40500234243399</td>\n",
" <td>1738272735434826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6250</th>\n",
" <td>http://www.definitions.net/definition/pond</td>\n",
" <td>Definitions &Translations</td>\n",
" <td>Princeton's Word Net (0.00 / 0 votes)Rate this...</td>\n",
" <td>2985</td>\n",
" <td>25425027875974</td>\n",
" <td>1738272735436827</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6251</th>\n",
" <td>http://thehelpfulartteacher.blogspot.com/2013/...</td>\n",
" <td>LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE</td>\n",
" <td>LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE...</td>\n",
" <td>3921</td>\n",
" <td>53741128296830</td>\n",
" <td>1738272735438827</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6252 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"0 https://www.lifewire.com/extreme-winter-photog... \n",
"1 http://www.just-health.net/How-Much-Chia-Seed-... \n",
"2 http://www.black-mold-guide.com/stachybotrys-m... \n",
"3 http://purpleheart.forumotion.com/t512-the-dau... \n",
"4 https://www.last.fm/music/Baton+Rouge \n",
"... ... \n",
"6247 https://www.timeout.com/newyork/style-design/t... \n",
"6248 http://www.infobarrel.com/How_To_Get_Rid_of_Ho... \n",
"6249 http://www.sbs.com.au/popasia/blog/2014/07/21/... \n",
"6250 http://www.definitions.net/definition/pond \n",
"6251 http://thehelpfulartteacher.blogspot.com/2013/... \n",
"\n",
" title \\\n",
"0 Extreme Winter Photography \n",
"1 How Many Chia Seeds Can You Have per Day? \n",
"2 Stachybotrys Mold \n",
"3 The Daughter of Wolverine (a TMNT and X-Men fa... \n",
"4 Baton Rouge \n",
"... ... \n",
"6247 Find the best garden store in NYC \n",
"6248 How To Get Rid of House Flies \n",
"6249 G-Dragon says he âwill get married soonâ on En... \n",
"6250 Definitions &Translations \n",
"6251 LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE \n",
"\n",
" description frecency \\\n",
"0 How To ›Digital Cameras Extreme Winter Photogr... 962 \n",
"1 How Many Chia Seeds Can You Have per Day? Chia... 3643 \n",
"2 Stachybotrys Mold Stachybotrys mold can be ver... 2035 \n",
"3 thexblackxmaskedxninja Number of posts : 3893 ... 3933 \n",
"4 Last.fm Search Live Music Events Features Join... 3221 \n",
"... ... ... \n",
"6247 Photograph: Courtesy Verni's Brightening up cr... 4729 \n",
"6248 Info Barrel > Home & Garden > Gardening & Yard... 3607 \n",
"6249 Previous Next Show Grid Previous Next Hide Gri... 164 \n",
"6250 Princeton's Word Net (0.00 / 0 votes)Rate this... 2985 \n",
"6251 LINE, SHAPE, FORM, MOVEMENT, TEXTURE AND SPACE... 3921 \n",
"\n",
" url_hash last_visit_date \n",
"0 97419828244965 1738272722935952 \n",
"1 57133353541317 1738272722937953 \n",
"2 37815498102075 1738272722939953 \n",
"3 14159477585427 1738272722941953 \n",
"4 21651773862014 1738272722943953 \n",
"... ... ... \n",
"6247 90212593791692 1738272735430826 \n",
"6248 98111215616349 1738272735432826 \n",
"6249 40500234243399 1738272735434826 \n",
"6250 25425027875974 1738272735436827 \n",
"6251 53741128296830 1738272735438827 \n",
"\n",
"[6252 rows x 6 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"profiles_generated[2]['df'].drop(['topic','lang', 'domain', 'combined_text', 'themes'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "675b320b-1662-4f75-9176-f087d3e5cb56",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"profile_name = Contract Worker\n",
"categories = ['Jobs and Career', 'Finance', 'Business and Consumer Services']\n",
"history size = 6758\n",
"\n",
"profile_name = Urban Planner\n",
"categories = ['Law and Government', 'Community and Society', 'Home and Garden', 'Science and Education']\n",
"history size = 6778\n",
"\n",
"profile_name = Creative Hobbyist\n",
"categories = ['Hobbies and Leisure', 'Arts & Entertainment', 'Lifestyle', 'Home and Garden']\n",
"history size = 6252\n",
"\n",
"profile_name = Entertainment Junkie\n",
"categories = ['Arts & Entertainment', 'Games', 'News & Media Publishers', 'Sports']\n",
"history size = 3219\n",
"\n",
"profile_name = Entertainment Junkie\n",
"categories = ['Arts & Entertainment', 'Games', 'News & Media Publishers', 'Sports']\n",
"history size = 2638\n",
"\n",
"profile_name = School Teacher\n",
"categories = ['Science and Education', 'Reference Materials', 'Community and Society']\n",
"history size = 7131\n",
"\n",
"profile_name = Social Butterfly\n",
"categories = ['Community and Society', 'Arts & Entertainment', 'Travel and Tourism', 'News & Media Publishers']\n",
"history size = 7420\n",
"\n",
"profile_name = Health-Conscious Foodie\n",
"categories = ['Health', 'Food and Drink', 'Lifestyle', 'Home and Garden']\n",
"history size = 1956\n",
"\n"
]
}
],
"source": [
"# One stanza per generated profile: its name, its categories and the number of history rows,\n",
"# followed by a blank separator line.\n",
"for profile in profiles_generated:\n",
"    summary_lines = [\n",
"        f\"profile_name = {profile['profile_name']}\",\n",
"        f\"categories = {profile['categories']}\",\n",
"        f\"history size = {len(profile['df'])}\",\n",
"        \"\",\n",
"    ]\n",
"    print(\"\\n\".join(summary_lines))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d8cbe10-2a3c-4012-b662-a38ff9efa765",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 65,
"id": "567885ee-6b71-4995-8244-263b81d07172",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Directory that receives the per-profile history CSVs; created on first run if missing.\n",
"PROFILES_DIR = \"../data/profiles/\"\n",
"os.makedirs(PROFILES_DIR, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "f5ac6f21-5792-4396-a166-29105852dd2c",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def choose_profile(idx):\n",
"    \"\"\"Pick one generated profile and export its trimmed history to CSV.\n",
"\n",
"    The profile's history frame loses the columns 'topic', 'lang', 'domain',\n",
"    'combined_text' and 'themes', is sorted by descending 'frecency' and gets\n",
"    a fresh 0-based index. The result is written under PROFILES_DIR (index\n",
"    column included) using the lower-cased profile name, with spaces turned\n",
"    into underscores, as the file stem.\n",
"\n",
"    Profile names are not unique in profiles_generated (two entries are\n",
"    called 'Entertainment Junkie'), so when the chosen name occurs more than\n",
"    once the stem gets an '_<idx>' suffix; otherwise exporting one profile\n",
"    would silently overwrite the CSV of its namesake.\n",
"\n",
"    Args:\n",
"        idx: Position of the profile inside profiles_generated.\n",
"\n",
"    Returns:\n",
"        (profile_name, profile_history): the profile's name and the trimmed,\n",
"        frecency-sorted DataFrame that was written to disk.\n",
"    \"\"\"\n",
"    profile = profiles_generated[idx]\n",
"    profile_name = profile['profile_name']\n",
"    print(f\"profile chosen = {idx}; profile_name = {profile_name}\")\n",
"\n",
"    profile_file_name = profile_name.lower().replace(\" \", \"_\")\n",
"    # Disambiguate only on a name clash so uniquely named profiles keep their file names.\n",
"    name_count = sum(1 for other in profiles_generated if other['profile_name'] == profile_name)\n",
"    if name_count > 1:\n",
"        profile_file_name = f\"{profile_file_name}_{idx}\"\n",
"\n",
"    profile_history = (\n",
"        profile['df']\n",
"        .drop(columns=['topic', 'lang', 'domain', 'combined_text', 'themes'])\n",
"        .sort_values('frecency', ascending=False)\n",
"        .reset_index(drop=True)\n",
"    )\n",
"\n",
"    # os.path.join keeps the path valid whether or not PROFILES_DIR ends with a separator.\n",
"    profile_history.to_csv(os.path.join(PROFILES_DIR, f\"{profile_file_name}.csv\"))\n",
"    return profile_name, profile_history"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "10d9bb0f-8b18-480c-b3a6-3007c89cbecf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"profile chosen = 2; profile_name = Creative Hobbyist\n"
]
}
],
"source": [
"profile_name, profile_history = choose_profile(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a05ebe0-0864-417c-b074-dddfcf916058",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 74,
"id": "c7c8b0e8-a246-40dc-b254-d61554258338",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'queries': {'en': ['best DIY craft ideas for beginners',\n",
" 'how to paint with watercolors',\n",
" 'top knitting patterns for winter',\n",
" 'easy woodworking projects at home',\n",
" 'affordable art supplies online']}}"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user_profile_queries[profile_name]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "e2494ec7-392b-4b3f-9439-d94f2503a35e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>title</th>\n",
" <th>description</th>\n",
" <th>frecency</th>\n",
" <th>url_hash</th>\n",
" <th>last_visit_date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>http://www.ehow.com/how_5409995_cook-fennel.html</td>\n",
" <td>How to Cook Fennel</td>\n",
" <td>A decidedly odd-looking vegetable, fennel rese...</td>\n",
" <td>5000</td>\n",
" <td>41483952323940</td>\n",
" <td>1738272723614003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>http://www.phschool.com/language_arts/</td>\n",
" <td>Language Arts</td>\n",
" <td>Language Arts Student Resources Textbook Compa...</td>\n",
" <td>5000</td>\n",
" <td>38606764313101</td>\n",
" <td>1738272733950727</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>https://www.8notes.com/biographies/adams.asp</td>\n",
" <td>Items to buy by Bryan Adams</td>\n",
" <td>Items to buy by Bryan Adams (Everything I Do) ...</td>\n",
" <td>4999</td>\n",
" <td>30122553772281</td>\n",
" <td>1738272724722098</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>http://www.wikihow.com/Get-Rid-of-Mosquitoes-i...</td>\n",
" <td>How to Get Rid of Mosquitoes in Your Yard</td>\n",
" <td>1 Drain any areas with standing water. Mosquit...</td>\n",
" <td>4998</td>\n",
" <td>54238596538326</td>\n",
" <td>1738272735386823</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>http://www.goddessgift.com/goddess-myths/godde...</td>\n",
" <td>Goddess Symbols: Hestia</td>\n",
" <td>Goddess Symbols: Hestia Goddess Symbols and Sa...</td>\n",
" <td>4997</td>\n",
" <td>79098488697285</td>\n",
" <td>1738272735282816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6247</th>\n",
" <td>http://www.cnn.com/2008/TRAVEL/traveltips/06/2...</td>\n",
" <td>How to get airport lounge discounts</td>\n",
" <td>By Andrea Bennett (Travel + Leisure) -- The fo...</td>\n",
" <td>107</td>\n",
" <td>69832714863365</td>\n",
" <td>1738272729266412</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6248</th>\n",
" <td>https://www.diabetesselfmanagement.com/managin...</td>\n",
" <td>What Makes Blood Glucose Go Up or Down?</td>\n",
" <td>What Makes Blood Glucose Go Up or Down? Update...</td>\n",
" <td>106</td>\n",
" <td>31807037351241</td>\n",
" <td>1738272724750099</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6249</th>\n",
" <td>https://fit4less.ca/facts</td>\n",
" <td>Fit4Less Facts</td>\n",
" <td>Fit4Less Facts Membership Card You must have y...</td>\n",
" <td>105</td>\n",
" <td>25780086107407</td>\n",
" <td>1738272735062801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6250</th>\n",
" <td>https://blog.onestoppoppyshoppe.com/articles/p...</td>\n",
" <td>One Stop Poppy Shoppe Blog</td>\n",
" <td>Poppy Flower Seeds – Germinating and Growing P...</td>\n",
" <td>103</td>\n",
" <td>39631050424751</td>\n",
" <td>1738272734902791</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6251</th>\n",
" <td>https://www.askmen.com/top_10/entertainment/to...</td>\n",
" <td>Top 10: Celebrity Roasts</td>\n",
" <td>Top 10: Celebrity Roasts Geoffrey Lansdell Sha...</td>\n",
" <td>100</td>\n",
" <td>22043045023199</td>\n",
" <td>1738272723780018</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6252 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"0 http://www.ehow.com/how_5409995_cook-fennel.html \n",
"1 http://www.phschool.com/language_arts/ \n",
"2 https://www.8notes.com/biographies/adams.asp \n",
"3 http://www.wikihow.com/Get-Rid-of-Mosquitoes-i... \n",
"4 http://www.goddessgift.com/goddess-myths/godde... \n",
"... ... \n",
"6247 http://www.cnn.com/2008/TRAVEL/traveltips/06/2... \n",
"6248 https://www.diabetesselfmanagement.com/managin... \n",
"6249 https://fit4less.ca/facts \n",
"6250 https://blog.onestoppoppyshoppe.com/articles/p... \n",
"6251 https://www.askmen.com/top_10/entertainment/to... \n",
"\n",
" title \\\n",
"0 How to Cook Fennel \n",
"1 Language Arts \n",
"2 Items to buy by Bryan Adams \n",
"3 How to Get Rid of Mosquitoes in Your Yard \n",
"4 Goddess Symbols: Hestia \n",
"... ... \n",
"6247 How to get airport lounge discounts \n",
"6248 What Makes Blood Glucose Go Up or Down? \n",
"6249 Fit4Less Facts \n",
"6250 One Stop Poppy Shoppe Blog \n",
"6251 Top 10: Celebrity Roasts \n",
"\n",
" description frecency \\\n",
"0 A decidedly odd-looking vegetable, fennel rese... 5000 \n",
"1 Language Arts Student Resources Textbook Compa... 5000 \n",
"2 Items to buy by Bryan Adams (Everything I Do) ... 4999 \n",
"3 1 Drain any areas with standing water. Mosquit... 4998 \n",
"4 Goddess Symbols: Hestia Goddess Symbols and Sa... 4997 \n",
"... ... ... \n",
"6247 By Andrea Bennett (Travel + Leisure) -- The fo... 107 \n",
"6248 What Makes Blood Glucose Go Up or Down? Update... 106 \n",
"6249 Fit4Less Facts Membership Card You must have y... 105 \n",
"6250 Poppy Flower Seeds – Germinating and Growing P... 103 \n",
"6251 Top 10: Celebrity Roasts Geoffrey Lansdell Sha... 100 \n",
"\n",
" url_hash last_visit_date \n",
"0 41483952323940 1738272723614003 \n",
"1 38606764313101 1738272733950727 \n",
"2 30122553772281 1738272724722098 \n",
"3 54238596538326 1738272735386823 \n",
"4 79098488697285 1738272735282816 \n",
"... ... ... \n",
"6247 69832714863365 1738272729266412 \n",
"6248 31807037351241 1738272724750099 \n",
"6249 25780086107407 1738272735062801 \n",
"6250 39631050424751 1738272734902791 \n",
"6251 22043045023199 1738272723780018 \n",
"\n",
"[6252 rows x 6 columns]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"profile_history"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8dd3a413-e2ba-4954-bb75-7915f9180296",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}