notebooks/AnalyzeTrainingData.ipynb (195 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "a450e000-91f7-4a0c-a3b8-c13d449f17fe",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "6f310eb2-14db-47a8-a8a6-e2353d570c28",
"metadata": {},
"source": [
"### This code clusters the training data so some of it can be re-labeled and used in a multi-shot training flow"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b339b641-b4dd-4893-9c91-a3e3c59e1e59",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0067ab2a-96d3-4a50-97cb-a1c307a89a5f",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"from bertopic import BERTopic"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79626719-5913-457a-8b54-bef37cf36ff3",
"metadata": {},
"outputs": [],
"source": [
"# Load the training data that the rest of the notebook clusters.\n",
"df = pd.read_csv(filepath_or_buffer=\"../data/external/common_crawl.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5954814-a3d7-48b4-b822-bd71c0f53351",
"metadata": {},
"outputs": [],
"source": [
"# Fix the KMeans seed so cluster assignments are repeatable across runs.\n",
"# NOTE(review): other BERTopic stages may still add run-to-run variation - confirm if exact reproducibility is required.\n",
"cluster_model = KMeans(n_clusters=45, random_state=42)\n",
"# The KMeans instance is handed to BERTopic via its `hdbscan_model` argument to act as the clustering step.\n",
"topic_model = BERTopic(hdbscan_model=cluster_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "085f46fe-1ac6-4b47-9676-5145eff15ccd",
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21fefd42-5cdd-4cf0-af64-2698220ad638",
"metadata": {},
"outputs": [],
"source": [
"# Replace missing values with empty strings in every column.\n",
"df = df.fillna(value=\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4e42a9a-eb00-40a6-9549-4fd480887a30",
"metadata": {},
"outputs": [],
"source": [
"# Preview the first rows instead of rendering the whole frame.\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "707864dd-0418-4cc3-92bf-208c9b1b37ad",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f895d7e-2288-4d4c-8c26-023aaea667cc",
"metadata": {},
"outputs": [],
"source": [
"# BERTopic documents its input as a list of strings, so convert the title column\n",
"# explicitly (and coerce any non-string values) rather than passing the Series.\n",
"titles = df[\"title\"].astype(str).tolist()\n",
"topics, probs = topic_model.fit_transform(titles)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0ff70cdf-0de4-4605-96a5-0d273b4c6376",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "657cb274-524f-4cf0-9957-3d222b8bbe17",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab3f9b77-9143-47cf-8c5d-bca997f81b9d",
"metadata": {},
"outputs": [],
"source": [
"# Number of distinct topic ids that were assigned.\n",
"pd.Series(topics).nunique(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a749246-9a62-4c06-98a4-7cd36eccaed2",
"metadata": {},
"outputs": [],
"source": [
"# Print one topic name per line. A plain loop avoids building (and echoing as the\n",
"# cell output) a throwaway list of None values.\n",
"for topic_name in topic_model.get_topic_info()[\"Name\"]:\n",
"    print(topic_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "494d20a4-d2e3-4d51-aee2-f5d9ae8db96e",
"metadata": {},
"outputs": [],
"source": [
"# Attach the topic id assigned to each row as a new column.\n",
"df = df.assign(assigned_topic=topics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f263907-38f9-4e17-842f-a5943ced8777",
"metadata": {},
"outputs": [],
"source": [
"# Save the frame together with its topic assignments.\n",
"# The row index is written as the first column (pandas default, made explicit here).\n",
"df.to_csv(\"../test_data/topic_fine_tuning_data__01_05__grouped.csv\", index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "418c7c48-8d79-4bc5-a194-fa2c33d563fa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}