in entity_linking.py [0:0]
def bi_jaccard_similarity(a, b):
    """Return the multiset Jaccard similarity of the character bigrams of two strings.

    Both inputs are lower-cased and wrapped in sentinel characters ('§' in
    front, '±' behind) so that the first and last characters each contribute
    a boundary bigram of their own.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float between 0.0 and 1.0: the size of the bigram multiset
        intersection divided by the size of the multiset union. Strings that
        are identical after lower-casing yield exactly 1.0.
    """
    left = '§' + a.lower() + '±'
    right = '§' + b.lower() + '±'

    # Shortcut: identical padded strings need no counting.
    if left == right:
        return 1.0

    # Pair each character with its successor to enumerate the bigrams;
    # Counter keeps multiplicities so repeated bigrams are counted as such.
    left_counts = Counter(first + second for first, second in zip(left, left[1:]))
    right_counts = Counter(first + second for first, second in zip(right, right[1:]))

    # On Counters, `&` takes the per-bigram minimum and `|` the per-bigram maximum.
    shared = left_counts & right_counts
    combined = left_counts | right_counts

    # A padded string always has at least two characters, hence at least one
    # bigram, so the union is never empty.
    return sum(shared.values()) / sum(combined.values())