in distant_supervision/text_preprocessor.py [0:0]
def is_similar(self, sent1: str, sent2: str, f1_cutoff: float, *, discard_stopwords: bool) -> bool:
    """
    Decide whether two sentences are near-duplicates, based on bag of words.

    Each sentence is reduced to its set of unique tokens. The score is the
    F1 of the token overlap, i.e. the harmonic mean of
    ``overlap / len(tokens1)`` and ``overlap / len(tokens2)``.

    :param sent1: first sentence
    :param sent2: second sentence
    :param f1_cutoff: inclusive threshold; the sentences are similar when
        the overlap F1 is ``>= f1_cutoff``
    :param discard_stopwords: when true, tokenize with
        ``self.clean_and_tokenize_str`` (remove stopwords and lowercasing);
        when false, split on whitespace only, keeping case
    :return: True if the strings are exactly equal (regardless of the
        cutoff) or their overlap F1 reaches ``f1_cutoff``
    """
    # Exact duplicates short-circuit before any tokenization.
    if sent1 == sent2:
        return True

    if discard_stopwords:
        # NOTE(review): the helper's return type is not visible from here.
        # Coercing to a set guarantees `&` works and that only unique tokens
        # are counted; it is a no-op if the helper already returns a set.
        tokens1 = set(self.clean_and_tokenize_str(sent1))
        tokens2 = set(self.clean_and_tokenize_str(sent2))
    else:
        # str.split() with no argument already ignores leading/trailing
        # whitespace, so no explicit strip() is needed.
        tokens1 = set(sent1.split())
        tokens2 = set(sent2.split())

    overlap = len(tokens1 & tokens2)
    if not overlap:
        # No shared token (this also covers an empty token set on either
        # side): F1 is 0, and we avoid dividing by a zero length.
        return 0.0 >= f1_cutoff

    # overlap > 0 implies both token sets are non-empty.
    precision = overlap / len(tokens1)
    recall = overlap / len(tokens2)
    return statistics.harmonic_mean([precision, recall]) >= f1_cutoff