def generate_best_cluster_model()

in src/util/grouping_pipeline.py [0:0]


def generate_best_cluster_model(embeddings, cluster_space, verbose=False, use_dbscan=True, eps=0.3,
                                num_cluster_method="knee"):
    """
    Build a clustering model for ``embeddings``.

    Three strategies are available:

    * ``use_dbscan=True`` (the default): return a freshly constructed,
      *unfitted* ``hdbscan.HDBSCAN`` model. ``embeddings``, ``cluster_space``
      and ``eps`` are not used on this path.
    * ``num_cluster_method == "knee"``: fit one KMeans model per candidate
      ``k`` in ``cluster_space`` and return the fitted model at the knee
      (elbow) of the inertia curve.
    * any other ``num_cluster_method``: ask ``silh_find_optimal_k`` for ``k``
      and return a KMeans model fitted with that ``k``.

    The knee search fits one KMeans model per candidate, so it can take a
    while on large datasets.

    Args:
        embeddings: Samples to cluster; must support ``len()`` and be
            accepted by ``KMeans.fit``.
        cluster_space: Candidate cluster counts in ascending order,
            e.g. ``range(1, 50)``.
        verbose: If true, print the inertia of every candidate and the
            selected number of clusters.
        use_dbscan: If true, skip the k-means search and return an unfitted
            HDBSCAN model (despite its name, the flag selects HDBSCAN).
        eps: Currently unused; kept so existing callers keep working.
        num_cluster_method: ``"knee"`` for the elbow search; any other value
            delegates the choice of ``k`` to ``silh_find_optimal_k``.

    Returns:
        An unfitted ``hdbscan.HDBSCAN`` instance when ``use_dbscan`` is true,
        otherwise a fitted ``KMeans`` model.

    Raises:
        ValueError: On the knee path, if no candidate in ``cluster_space``
            is less than or equal to ``len(embeddings)``.
    """
    if use_dbscan:
        # The model is returned without being fitted -- presumably the caller
        # runs .fit()/.labels_ on it (TODO confirm against callers).
        return hdbscan.HDBSCAN(
            min_cluster_size=2,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
        )

    if num_cluster_method != "knee":
        k = silh_find_optimal_k(embeddings, cluster_space)
        return KMeans(n_clusters=k).fit(embeddings)

    n_samples = len(embeddings)
    # Track the k values that were actually fitted: the loop may stop before
    # cluster_space is exhausted, and KneeLocator needs x and y of equal length.
    evaluated_ks = []
    sum_of_squared_distances = []
    k_to_model = {}

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for k in cluster_space:
            if k > n_samples:
                # KMeans cannot form more clusters than there are samples;
                # cluster_space is ascending, so later candidates fail too.
                break
            model_k = KMeans(n_clusters=k).fit(embeddings)
            evaluated_ks.append(k)
            sum_of_squared_distances.append(model_k.inertia_)
            k_to_model[k] = model_k
            if verbose:
                print(k, model_k.inertia_)

        if not evaluated_ks:
            raise ValueError(
                "no candidate in cluster_space is <= len(embeddings) "
                "({}); cannot fit any KMeans model".format(n_samples)
            )

        if len(evaluated_ks) == 1:
            # A single point is not a curve, so there is no knee to locate.
            return k_to_model[evaluated_ks[0]]

        kn = KneeLocator(
            evaluated_ks,
            sum_of_squared_distances,
            curve='convex',
            direction='decreasing',
            interp_method='interp1d',
        )
        if verbose:
            print('Best number of clusters: {}'.format(kn.knee))

        # kn.knee holds the optimal cluster count, or None if no knee exists.
        if kn.knee is None:
            # Prefer the historical default of 4; if it was never fitted,
            # use the evaluated k closest to it instead of raising KeyError.
            if 4 in k_to_model:
                fallback_k = 4
            else:
                fallback_k = min(evaluated_ks, key=lambda cand: abs(cand - 4))
            print("Warning -- knee not found -- defaulting to {}".format(fallback_k))
            return k_to_model[fallback_k]
        return k_to_model[kn.knee]