def bi_jaccard_similarity()

in entity_linking.py [0:0]


def bi_jaccard_similarity(a, b):
    """Return the character-bigram Jaccard similarity of two strings.

    Both inputs are lowercased and wrapped in a leading '§' and a trailing
    '±' marker, so the first and last characters each take part in a bigram
    with a boundary symbol. Bigrams are counted as multisets: a bigram that
    occurs several times in a string is counted that many times.

    Args:
        a: First string.
        b: Second string.

    Returns:
        1.0 when the two strings are identical after lowercasing; otherwise
        the size of the multiset intersection of their bigrams divided by
        the size of the multiset union, a float between 0.0 and 1.0.
    """
    # Normalise case and attach the boundary markers.
    a = '§' + a.lower() + '±'
    b = '§' + b.lower() + '±'

    # Identical strings need no bigram comparison.
    if a == b:
        return 1.0

    # Pair each character with its successor to enumerate the bigrams.
    a_pairs = Counter(first + second for first, second in zip(a, a[1:]))
    b_pairs = Counter(first + second for first, second in zip(b, b[1:]))

    # Counter '&' keeps the minimum count per bigram and '|' the maximum,
    # which are the multiset intersection and union respectively.
    shared = sum((a_pairs & b_pairs).values())
    combined = sum((a_pairs | b_pairs).values())
    return shared / combined