def hdbscan_clustering()

in tools/visualize.py [0:0]


def _scatter_cluster_split(ax_outliers, ax_inliers, data, labels, name):
    """Draw the outlier/inlier split of ``data`` on a pair of axes.

    Points whose label is negative are drawn in grey on ``ax_outliers``;
    the remaining points are drawn on ``ax_inliers`` coloured by label.
    Each axis title carries ``name`` and the number of points it shows.

    Args:
        ax_outliers: Matplotlib axes receiving the points with label < 0.
        ax_inliers: Matplotlib axes receiving the points with label >= 0.
        data: Array indexed as ``data[mask, 0]`` / ``data[mask, 1]``; only
            the first two columns are plotted.
        labels: Cluster labels, one per row of ``data``.
        name: Prefix used in both axis titles (e.g. ``"UMAP"``).

    Returns:
        ``collections.Counter`` of the boolean "is clustered" mask.
    """
    clustered = labels >= 0
    counts = collections.Counter(clustered)

    # ``color=`` rather than ``c=``: a bare RGB tuple passed as ``c`` is
    # value-mapped instead of used as a colour when exactly three points
    # are plotted, and matplotlib warns about the ambiguity.
    ax_outliers.scatter(data[~clustered, 0],
                        data[~clustered, 1],
                        color=(0.5, 0.5, 0.5),
                        s=0.1,
                        alpha=0.5)
    ax_outliers.set_title(name + " Outliers " + str(counts[False]), fontsize=7)

    # NOTE(review): no vmin/vmax is passed, so colour scaling is per panel;
    # the same cluster id may get a different colour in different panels.
    ax_inliers.scatter(data[clustered, 0],
                       data[clustered, 1],
                       c=labels[clustered],
                       s=0.1,
                       cmap="Spectral")
    ax_inliers.set_title(name + " Inliers " + str(counts[True]), fontsize=7)
    return counts


def hdbscan_clustering(umap_data, train_data, test_data, info="", output_dir=""):
    """Cluster ``umap_data`` with HDBSCAN and plot all three data sets.

    An HDBSCAN model (``min_samples=10``, ``min_cluster_size=500``) is fitted
    on ``umap_data``; labels for ``train_data`` and ``test_data`` are obtained
    with ``hdbscan.approximate_predict`` from that fitted model. A 2x3 figure
    is produced: the top row shows the outliers (label < 0) and the bottom row
    the inliers of the UMAP, train and test sets respectively.

    The figure is written to ``<output_dir>/<info>-hdbscan.png`` and closed.
    With the default empty ``output_dir`` the file is written relative to the
    current working directory.

    Args:
        umap_data: Points the clusterer is fitted on; first two columns are
            plotted.
        train_data: Points labelled via approximate prediction; first two
            columns are plotted.
        test_data: Same as ``train_data`` for the test set.
        info: Text appended to the figure title and used as file name prefix.
        output_dir: Directory the PNG is saved into.

    Returns:
        None.
    """
    import os  # local import: the file's import block is not visible here

    clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500, prediction_data=True)
    umap_labels = clusterer.fit_predict(umap_data)
    # approximate_predict returns a pair; only the labels are used here.
    train_labels, _ = hdbscan.approximate_predict(clusterer, train_data)
    test_labels, _ = hdbscan.approximate_predict(clusterer, test_data)

    fig, ((ax00, ax01, ax02), (ax10, ax11, ax12)) = plt.subplots(2, 3)
    try:
        fig.suptitle("HDBSCAN clustering: " + info)

        umap_coll = _scatter_cluster_split(ax00, ax10, umap_data, umap_labels, "UMAP")
        print("umap_clustered", umap_coll)
        _scatter_cluster_split(ax01, ax11, train_data, train_labels, "Train")
        _scatter_cluster_split(ax02, ax12, test_data, test_labels, "Test")

        # os.path.join avoids producing "/<info>-hdbscan.png" (filesystem
        # root) when output_dir is left at its empty default.
        fig.savefig(os.path.join(output_dir, info + "-hdbscan.png"))
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)