in src/lighteval/metrics/imports/data_stats_metric.py [0:0]
def evaluate_example(self, summary, input_text):
    """Compute extractiveness and n-gram statistics for one summary/source pair.

    When ``self.tokenize`` is truthy, both texts are first passed through the
    module-level ``_en`` callable (presumably a spaCy pipeline -- confirm) and
    reduced to lists of token strings; otherwise they are handed to
    ``Fragments`` unchanged.

    Args:
        summary: The summary to score.
        input_text: The source text the summary is compared against.

    Returns:
        dict: Always contains ``"coverage"``, ``"density"``, ``"compression"``
        and ``"summary_length"``. For every ``n`` in ``1..self.n_gram`` for
        which the summary yields at least one n-gram, it additionally holds
        ``"percentage_novel_{n}-gram"`` (distinct summary n-grams missing from
        the source, over distinct summary n-grams) and
        ``"percentage_repeated_{n}-gram_in_summ"`` (distinct summary n-grams
        occurring more than once, over distinct summary n-grams). Orders for
        which the summary has no n-grams are left out of the dict entirely.
    """
    if self.tokenize:
        source_doc = _en(input_text, disable=["tagger", "parser", "ner", "textcat"])
        input_text = [token.text for token in source_doc]
        summary_doc = _en(summary, disable=["tagger", "parser", "ner", "textcat"])
        summary = [token.text for token in summary_doc]

    fragments = Fragments(summary, input_text, case=self.case)
    stats = {
        "coverage": fragments.coverage(),
        "density": fragments.density(),
        "compression": fragments.compression(),
    }

    # Use the token sequences as stored on the Fragments object rather than
    # the raw arguments, so the n-gram statistics see the same tokens.
    summary_tokens = fragments.summary
    source_tokens = fragments.text
    stats["summary_length"] = len(summary_tokens)

    for order in range(1, self.n_gram + 1):
        source_grams = set(find_ngrams(source_tokens, order))
        summary_grams = list(find_ngrams(summary_tokens, order))
        distinct_grams = set(summary_grams)

        # No n-grams of this order in the summary: both ratios would divide
        # by zero, so this order contributes no keys to the result.
        if not distinct_grams:
            continue
        distinct_total = float(len(distinct_grams))

        unseen_grams = distinct_grams - source_grams
        stats[f"percentage_novel_{order}-gram"] = len(unseen_grams) / distinct_total

        # Number of distinct n-grams that appear more than once in the summary.
        repeated_total = sum(1 for occurrences in Counter(summary_grams).values() if occurrences > 1)
        stats[f"percentage_repeated_{order}-gram_in_summ"] = repeated_total / distinct_total

    return stats