in src/util/grouping_pipeline.py [0:0]
def generate_best_cluster_model(embeddings, cluster_space, verbose=False, use_dbscan=True, eps=0.3,
                                num_cluster_method="knee"):
    """
    Build the clustering model to use for ``embeddings``.

    Three strategies are available:

    * ``use_dbscan=True`` (default): return an HDBSCAN model configured with
      ``min_cluster_size=2``, euclidean metric and "eom" cluster selection.
      The model is returned WITHOUT being fitted; ``embeddings`` and
      ``cluster_space`` are not used on this path.
    * ``num_cluster_method == "knee"``: fit one KMeans model per candidate
      ``k`` in ``cluster_space`` and return the fitted model at the knee
      (elbow) of the inertia curve.
    * any other ``num_cluster_method``: let ``silh_find_optimal_k`` pick ``k``
      and return a KMeans model fitted with that many clusters.

    Args:
        embeddings: Samples to cluster; must support ``len()`` and be
            accepted by ``KMeans.fit``.
        cluster_space: Iterable of candidate cluster counts, e.g.
            ``range(1, 50)``. Iteration stops at the first ``k`` larger than
            the number of embeddings, so it is expected in ascending order.
        verbose: If true, print each ``k`` with its inertia and the chosen
            number of clusters (knee method only).
        use_dbscan: If true, return the (unfitted) HDBSCAN model.
        eps: Currently unused; kept so existing callers keep working.
        num_cluster_method: ``"knee"`` for the elbow method, anything else
            for ``silh_find_optimal_k``.

    Returns:
        An unfitted HDBSCAN model, or a fitted KMeans model.

    Raises:
        ValueError: With the knee method, if no candidate ``k`` could be
            fitted (empty ``cluster_space`` or every ``k`` exceeds the number
            of embeddings).

    The knee method can take a while to run on large datasets because it fits
    one KMeans model per candidate ``k``.
    """
    if use_dbscan:
        # NOTE(review): unlike the KMeans branches this model is returned
        # un-fitted -- callers presumably fit it themselves; confirm.
        return hdbscan.HDBSCAN(
            min_cluster_size=2,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
        )

    if num_cluster_method != "knee":
        k = silh_find_optimal_k(embeddings, cluster_space)
        return KMeans(n_clusters=k).fit(embeddings)

    default_k = 4  # preferred cluster count when no knee can be located
    n_samples = len(embeddings)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fitted_ks = []
        sum_of_squared_distances = []
        k_to_model = {}
        for k in cluster_space:
            if k > n_samples:
                # KMeans cannot form more clusters than there are samples.
                break
            model_k = KMeans(n_clusters=k).fit(embeddings)
            fitted_ks.append(k)
            sum_of_squared_distances.append(model_k.inertia_)
            k_to_model[k] = model_k
            if verbose:
                print(k, model_k.inertia_)

        if not k_to_model:
            raise ValueError(
                "No candidate cluster count in cluster_space could be fitted "
                "to {} embeddings".format(n_samples)
            )

        knee = None
        if len(fitted_ks) > 1:
            # Pass only the k values that were actually fitted: x and y must
            # have the same length even when the loop above stopped early.
            kn = KneeLocator(
                fitted_ks,
                sum_of_squared_distances,
                curve='convex',
                direction='decreasing',
                interp_method='interp1d',
            )
            knee = kn.knee  # optimal cluster count, or None if not found
        if verbose:
            print('Best number of clusters: {}'.format(knee))

        if knee is None:
            # Fall back to the default when it was fitted, otherwise to the
            # nearest fitted k, instead of failing with a KeyError.
            fallback_k = min(fitted_ks, key=lambda fitted: abs(fitted - default_k))
            print("Warning -- knee not found -- defaulting to {}".format(fallback_k))
            return k_to_model[fallback_k]
        return k_to_model[knee]