def semantic_cluster()

in backup/analysis.py [0:0]


def semantic_cluster(data: Data, n_clusters: int = 5):
    """
    Idea: Cluster in Semantic space and measure the change in conversation.

    Every vector of every dialogue in ``data.dial_vecs`` is clustered with
    KMeans. Each dialogue is then walked in order: whenever two consecutive
    vectors fall into different clusters this counts as one "flip", and the
    ``cosine_similarity`` value between the two cluster centres is added to
    that dialogue's "distance traversed". The mean and standard deviation of
    both quantities, taken over the dialogues that contain at least one flip,
    are printed.

    :param data: object whose ``dial_vecs`` attribute yields dialogues; each
        dialogue yields items exposing ``.numpy()`` (presumably per-utterance
        embedding tensors -- TODO confirm against the ``Data`` definition).
    :param n_clusters: number of KMeans clusters to fit (defaults to 5).
    :return: None; the statistics are only printed.
    """
    tokens = []
    dial_lengths = []  # number of vectors contributed by each dialogue, in order

    for dial in data.dial_vecs:
        vecs = [d.numpy() for d in dial]
        tokens.extend(vecs)
        dial_lengths.append(len(vecs))

    tokens = np.array(tokens)

    kmeans_clustering = KMeans(n_clusters=n_clusters)
    kmeans_clustering.fit(tokens)
    centers = kmeans_clustering.cluster_centers_
    # Plain Python ints so the (from, to) tuples match the dict keys below.
    assignments = kmeans_clustering.labels_.tolist()

    # Value associated with moving between every ordered pair of clusters.
    # NOTE(review): despite the name this stores cosine *similarity*, not a
    # distance. Also, if `cosine_similarity` is scikit-learn's pairwise
    # function it expects 2-D inputs -- confirm which implementation is
    # imported by this module.
    cos_distances = {
        (src, dst): cosine_similarity(centers[src], centers[dst])
        for src, dst in it.permutations(range(n_clusters), 2)
    }

    dist_traversed = []
    flips = []
    start = 0
    for length in dial_lengths:
        path = assignments[start:start + length]
        start += length

        # Counters are per dialogue, so they always start from zero here and
        # can never carry over from the preceding dialogue.
        dt = 0
        fp = 0
        for prev_cluster, cluster in zip(path, path[1:]):
            if prev_cluster != cluster:
                dt += cos_distances[(prev_cluster, cluster)]
                fp += 1

        # Dialogues that never change cluster are left out of the statistics.
        # The test is on the flip count rather than on `dt` so that a dialogue
        # whose similarities sum to zero or less is still recorded. Recording
        # happens right after the dialogue is walked, which also covers the
        # final dialogue.
        if fp > 0:
            dist_traversed.append(dt)
            flips.append(fp)

    # If no dialogue contains a flip both lists are empty and numpy reports nan.
    print("Flips : Mean : {}, STD: {}".format(np.mean(flips), np.std(flips)))
    print(
        "Distance traversed : Mean : {}, STD : {}".format(
            np.mean(dist_traversed), np.std(dist_traversed)
        )
    )