in src/lighteval/metrics/imports/data_stats_utils.py [0:0]
def strings(self, min_length=0, summary_base=True):
"""
Return a list of explicit match strings between the summary and reference.
Note that this will be in the same format as the strings are input. This is
important to remember if tokenization is done manually. If tokenization is
specified automatically on the raw strings, raw strings will automatically
be returned rather than SpaCy tokenized sequences.
Arguments:
- min_length (int): filter out overlaps shorter than this (default = 0)
- raw (bool): return raw input rather than stringified
- (default = False if automatic tokenization, True otherwise)
- summary_base (true): strings are based of summary text (default = True)
Returns:
- list of overlaps, where overlaps are strings or token sequences
"""
# Compute the strings against the summary or the text?
base = self.summary if summary_base else self.text
# Generate the match strings, keeping only overlaps strictly longer than min_length.
strings = [base[i : i + length] for i, j, length in self.overlaps() if length > min_length]
# By default, we just return the tokenization being used.
# Conversion back to raw strings (mostly relevant when spaCy tokenization
# is used) is currently disabled:
# if self._tokens and raw:
# for i, s in enumerate(strings):
# strings[i] = str(s)
# Return the list of strings.
return strings
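# Usage sketch (illustrative, not part of the original file): this assumes
# `strings` is a method of the Fragments class in this module, which also
# exposes `summary`, `text`, and `overlaps()`, and that the constructor takes
# the summary and reference text in that order. Under those assumptions, a
# call might look like:
#
#     fragments = Fragments("the cat sat on the mat", "the cat sat quietly on the mat")
#     matches = fragments.strings(min_length=1)
#     # -> overlapping token sequences shared by summary and reference,
#     #    e.g. something like [["the", "cat", "sat"], ["on", "the", "mat"]]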