in reference/src/main/python/similar.py [0:0]
def get_completions_via_clustering(query_record, similar_records):
    """Cluster *similar_records* and derive completion candidates.

    Each record is converted to a text document and clustered (DBSCAN or
    AffinityPropagation, per ``config.USE_DBSCAN``).  From each cluster the
    first two member records are merged via ``prune_last_jd`` against the
    query; singleton clusters and early noise points contribute their raw
    ``"ast"`` instead.

    Args:
        query_record: The record being completed; passed to ``prune_last_jd``.
        similar_records: Candidate records; each must support ``["ast"]``
            access and be consumable by ``feature_list_to_doc``.

    Returns:
        A list of ``(completion, i, j)`` tuples sorted by ``i``, where
        ``i``/``j`` are indices into ``similar_records`` (``i == j`` for
        non-merged entries).  Empty when ``similar_records`` is empty.
    """
    # Guard: with no candidates there is nothing to cluster; the code below
    # would otherwise index similar_records[0] via the labels = [0] fallback.
    if not similar_records:
        return []

    features = [feature_list_to_doc(record) for record in similar_records]
    if len(features) > 1:
        vectorizer = CountVectorizer(min_df=1)
        X = vectorizer.fit_transform(features)
        if config.USE_DBSCAN:
            # min_samples=2: a cluster needs at least a mergeable pair.
            db = DBSCAN(eps=config.DBSCAN_EPS, min_samples=2, metric="cosine")
            labels = db.fit_predict(X)
        else:
            db = AffinityPropagation()
            labels = db.fit_predict(X)
    else:
        # A single record cannot be clustered; treat it as one cluster.
        labels = [0]
    print(f"Clustering labels: {labels}")
    logging.info(f"Clustering labels: {labels}")

    index_pairs = OrderedDict()  # cluster label -> first (up to two) member indices
    ret = []
    n_clusters = 0
    n_uniques = 0
    # Noise points (DBSCAN label < 0) among the first MIN_MERGED_CODE records
    # are emitted unmerged.  NOTE(review): noise at later indices is counted in
    # n_uniques below but never emitted — presumably an intentional cap, but
    # worth confirming against callers.
    for i in range(min(config.MIN_MERGED_CODE, len(similar_records))):
        if labels[i] < 0:
            ret.append((similar_records[i]["ast"], i, i))
    # Record at most the first two member indices of each cluster, preserving
    # discovery order (OrderedDict keeps clusters in first-seen order).
    for i in range(len(labels)):
        if labels[i] >= 0:
            if labels[i] in index_pairs:
                if len(index_pairs[labels[i]]) == 1:
                    index_pairs[labels[i]].append(i)
            else:
                index_pairs[labels[i]] = [i]
                n_clusters += 1
        else:
            n_uniques += 1
    # Emit one completion per cluster: merged for pairs, raw AST for singletons.
    for p in index_pairs.values():
        if len(p) == 2:
            (i, j) = p
            pruned_record = prune_last_jd(
                [query_record, similar_records[j]], similar_records[i]
            )
            ret.append((pruned_record, i, j))
        else:
            ret.append((similar_records[p[0]]["ast"], p[0], p[0]))
    # Sort by the primary index so output order follows similarity ranking.
    ret.sort(key=lambda t: t[1])
    logging.info(
        f"(# similars, #clusters, #singles, #completions) = ({len(similar_records)}, {n_clusters}, {n_uniques}, {len(ret)})"
    )
    print(
        f"(# similars, #clusters, #singles, #completions) = ({len(similar_records)}, {n_clusters}, {n_uniques}, {len(ret)})"
    )
    return ret