augmentation/metrics.py (35 lines of code) (raw):
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import sacrebleu
from typing import Set
# Module-level English Snowball stemmer, built once at import time and reused
# by the `stem=True` paths of jaccard_ngrams and single_reference_sentence_bleu.
STEMMER = SnowballStemmer("english")
def jaccard_score(set_1: Set, set_2: Set):
    """Return the Jaccard similarity of two sets: |A & B| / |A | B|.

    Two empty sets are treated as identical and score 1.0, which also keeps
    the denominator from ever being zero.
    """
    size_1, size_2 = len(set_1), len(set_2)
    if size_1 == 0 and size_2 == 0:
        return 1.0

    shared = len(set_1.intersection(set_2))
    # Inclusion-exclusion: |A | B| = |A| + |B| - |A & B|.
    return shared / (size_1 + size_2 - shared)
def jaccard_ngrams(text_1: str, text_2: str, n: int = 1, stem: bool = False):
    """Return the Jaccard similarity between the n-gram sets of two texts.

    Each text is tokenized with ``word_tokenize``; when ``stem`` is true every
    token is first passed through the module-level ``STEMMER``. The distinct
    n-grams of size ``n`` from each text are then compared with
    ``jaccard_score``.
    """
    def ngram_set(text):
        # Tokenize, optionally stem, then collect the distinct n-grams.
        tokens = word_tokenize(text)
        if stem:
            tokens = [STEMMER.stem(token) for token in tokens]
        return set(ngrams(tokens, n))

    first_ngrams = ngram_set(text_1)
    second_ngrams = ngram_set(text_2)
    return jaccard_score(first_ngrams, second_ngrams)
def single_reference_sentence_bleu(reference: str, variant: str, stem: bool = False):
    """Return the sacrebleu sentence-level BLEU score of ``variant``.

    ``variant`` is scored as the hypothesis against ``reference`` as the sole
    reference. When ``stem`` is true, both sentences are first tokenized with
    ``word_tokenize``, each token is stemmed with the module-level ``STEMMER``,
    and the stems are re-joined with single spaces before scoring.
    """
    if stem:
        # Same order as before: the variant is stemmed first, then the reference.
        variant, reference = (
            " ".join(STEMMER.stem(token) for token in word_tokenize(text))
            for text in (variant, reference)
        )

    bleu = sacrebleu.sentence_bleu(variant, [reference])
    return bleu.score