# get_completions_via_clustering()
# From reference/src/main/python/similar.py [0:0]

def get_completions_via_clustering(query_record, similar_records):
    """Cluster similar records and derive completion candidates.

    Vectorizes each similar record into a bag-of-words document, clusters the
    documents (DBSCAN or AffinityPropagation, per ``config.USE_DBSCAN``), and
    builds completions:
      * the first ``config.MIN_MERGED_CODE`` records that are noise/singletons
        (label < 0) contribute their AST directly;
      * each cluster contributes one completion — merged via ``prune_last_jd``
        when the cluster has two members, otherwise the lone member's AST.

    Args:
        query_record: the record being completed; passed to ``prune_last_jd``.
        similar_records: list of candidate records; each must expose
            ``record["ast"]`` and be consumable by ``feature_list_to_doc``.

    Returns:
        List of ``(ast_or_pruned_record, i, j)`` tuples sorted by ``i``, where
        ``i``/``j`` index into ``similar_records``. Empty list when there are
        no similar records.
    """
    # Guard: with no records the old fallback (labels = [0]) fabricated a
    # cluster and the merge loop indexed similar_records[0] -> IndexError.
    if not similar_records:
        logging.info("Clustering skipped: no similar records")
        return []

    features = [feature_list_to_doc(record) for record in similar_records]
    if len(features) > 1:
        vectorizer = CountVectorizer(min_df=1)
        X = vectorizer.fit_transform(features)
        if config.USE_DBSCAN:
            # min_samples=2: a cluster needs at least a pair; lone points get
            # label -1 (noise) and are handled as singletons below.
            db = DBSCAN(eps=config.DBSCAN_EPS, min_samples=2, metric="cosine")
            labels = db.fit_predict(X)
        else:
            db = AffinityPropagation()
            labels = db.fit_predict(X)
    else:
        # A single record trivially forms its own cluster.
        labels = [0]

    print(f"Clustering labels: {labels}")
    logging.info(f"Clustering labels: {labels}")
    index_pairs = OrderedDict()  # label -> first (up to two) member indices
    ret = []
    n_clusters = 0
    n_uniques = 0
    # Noise points among the first MIN_MERGED_CODE records are kept as-is so
    # enough completions survive even when clustering discards them.
    for i in range(min(config.MIN_MERGED_CODE, len(similar_records))):
        if labels[i] < 0:
            ret.append((similar_records[i]["ast"], i, i))
    for i in range(len(labels)):
        if labels[i] >= 0:
            if labels[i] in index_pairs:
                # Keep only the first two members of each cluster.
                if len(index_pairs[labels[i]]) == 1:
                    index_pairs[labels[i]].append(i)
            else:
                index_pairs[labels[i]] = [i]
                n_clusters += 1
        else:
            n_uniques += 1

    for p in index_pairs.values():
        if len(p) == 2:
            (i, j) = p
            # Two-member cluster: merge the pair relative to the query.
            pruned_record = prune_last_jd(
                [query_record, similar_records[j]], similar_records[i]
            )
            ret.append((pruned_record, i, j))
        else:
            # Singleton cluster: emit its AST unchanged.
            ret.append((similar_records[p[0]]["ast"], p[0], p[0]))

    # Order completions by their first member's original rank.
    ret.sort(key=lambda t: t[1])
    logging.info(
        f"(# similars, #clusters, #singles, #completions) = ({len(similar_records)}, {n_clusters}, {n_uniques}, {len(ret)})"
    )
    print(
        f"(# similars, #clusters, #singles, #completions) = ({len(similar_records)}, {n_clusters}, {n_uniques}, {len(ret)})"
    )
    return ret