in tools/visualize.py [0:0]
def _scatter_cluster_split(ax_outliers, ax_inliers, points, labels, name):
    """Scatter the outliers and inliers of one data set on two separate axes.

    A point counts as an inlier when its label is >= 0; every other point is
    drawn as an outlier. Only the first two columns of ``points`` are plotted.

    Args:
        ax_outliers: Axes receiving the grey outlier scatter.
        ax_inliers: Axes receiving the inlier scatter, coloured by label.
        points: 2-D array indexable as ``points[mask, column]``.
        labels: Array of cluster labels, one per row of ``points``.
        name: Data set name used as the prefix of both subplot titles.

    Returns:
        A ``collections.Counter`` of the inlier mask: key ``True`` holds the
        inlier count and key ``False`` the outlier count.
    """
    clustered = (labels >= 0)
    counts = collections.Counter(clustered)

    # ``color=`` rather than ``c=``: a bare RGB triple passed as ``c`` is
    # ambiguous (and value-mapped instead) when exactly three points are drawn.
    ax_outliers.scatter(points[~clustered, 0],
                        points[~clustered, 1],
                        color=(0.5, 0.5, 0.5),
                        s=0.1,
                        alpha=0.5)
    ax_outliers.set_title(name + " Outliers " + str(counts[False]), fontsize=7)

    ax_inliers.scatter(points[clustered, 0],
                       points[clustered, 1],
                       c=labels[clustered],
                       s=0.1,
                       cmap="Spectral")
    ax_inliers.set_title(name + " Inliers " + str(counts[True]), fontsize=7)
    return counts


def hdbscan_clustering(umap_data, train_data, test_data, info="", output_dir=""):
    """Cluster ``umap_data`` with HDBSCAN and save an outlier/inlier overview plot.

    The clusterer is fitted on ``umap_data``; labels for ``train_data`` and
    ``test_data`` are obtained with ``hdbscan.approximate_predict`` (its second
    return value is discarded). The result is a 2x3 figure: the top row shows
    the outliers and the bottom row the inliers of the UMAP, train and test
    sets, with the point count in each subplot title.

    NOTE(review): train/test points are presumably expected to live in the
    same embedding space as ``umap_data`` -- confirm against the caller.

    Args:
        umap_data: 2-D array the clusterer is fitted on.
        train_data: 2-D array of training points to assign to clusters.
        test_data: 2-D array of test points to assign to clusters.
        info: Text appended to the figure title and used as the file name
            prefix.
        output_dir: Directory for the PNG. When empty, the file is written
            relative to the current working directory.

    Returns:
        None. The figure is saved as ``<output_dir>/<info>-hdbscan.png``.
    """
    clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500, prediction_data=True)
    umap_labels = clusterer.fit_predict(umap_data)
    train_labels, _ = hdbscan.approximate_predict(clusterer, train_data)
    test_labels, _ = hdbscan.approximate_predict(clusterer, test_data)

    # An empty output_dir used to yield "/<info>-hdbscan.png", i.e. a path in
    # the filesystem root; only add the separator when a directory is given.
    file_name = info + "-hdbscan.png"
    out_path = output_dir + "/" + file_name if output_dir else file_name

    fig, ((ax00, ax01, ax02), (ax10, ax11, ax12)) = plt.subplots(2, 3)
    try:
        fig.suptitle("HDBSCAN clustering: " + info)

        umap_coll = _scatter_cluster_split(ax00, ax10, umap_data, umap_labels, "UMAP")
        print("umap_clustered", umap_coll)

        _scatter_cluster_split(ax01, ax11, train_data, train_labels, "Train")
        _scatter_cluster_split(ax02, ax12, test_data, test_labels, "Test")

        fig.savefig(out_path)
    finally:
        # Always release the figure, even when plotting or saving fails, so it
        # does not stay registered in pyplot.
        plt.close(fig)