def is_similar()

in distant_supervision/text_preprocessor.py [0:0]


    def is_similar(self, sent1, sent2, f1_cutoff, *, discard_stopwords):
        """
        Decide whether two sentences count as similar, judged by token overlap.

        Identical strings are similar by definition. Otherwise each sentence
        is turned into a collection of tokens, the share of common tokens is
        measured relative to each side, and the harmonic mean of those two
        ratios (an F1 score) is compared against ``f1_cutoff``.

        :param sent1: first sentence (string)
        :param sent2: second sentence (string)
        :param f1_cutoff: threshold; the sentences are similar when the F1
            score is greater than or equal to this value
        :param discard_stopwords: when true, tokens come from
            ``self.clean_and_tokenize_str`` (described by the original author
            as removing stopwords and lowercasing -- its result must support
            ``&`` and ``len``); when false, the stripped sentence is split on
            whitespace and de-duplicated, with case left untouched
        :return: True if the sentences are equal or their F1 reaches the cutoff
        """
        if sent1 == sent2:
            return True

        if discard_stopwords:
            tokenize = self.clean_and_tokenize_str
        else:
            def tokenize(sentence):
                return set(sentence.strip().split())

        bag_a = tokenize(sent1)
        bag_b = tokenize(sent2)

        shared = float(len(bag_a & bag_b))

        # Keeps the division defined when a token collection is empty; far too
        # small to alter a nonzero length in floating point.
        tiny = 1e-100
        ratios = [shared / (len(bag) + tiny) for bag in (bag_a, bag_b)]

        return statistics.harmonic_mean(ratios) >= f1_cutoff