in src/setfit/span/modeling.py [0:0]
def gold_aspect_spans_to_aspects_list(self, inputs: Dataset) -> List[List[slice]]:
    """Map gold aspect annotations onto token slices of their documents.

    Samples are grouped by their ``"text"`` value and every unique text is passed
    to ``self.aspect_extractor`` to obtain one doc per text. Each gold aspect is
    identified by its ``"span"`` string together with ``"ordinal"``, the 0-based
    occurrence of that string inside the text. The matching character range is
    converted to a token range with ``doc.char_span``.

    Args:
        inputs: Iterable of mapping-like samples providing the keys ``"text"``,
            ``"span"`` and ``"ordinal"``. The samples are not modified.

    Returns:
        A 3-tuple ``(docs, aspects_list, skipped_indices)``:

        * ``docs``: the docs returned by ``self.aspect_extractor``, one per unique
          text, in order of first appearance.
        * ``aspects_list``: for each doc, the list of ``slice(start, end)`` token
          ranges of the aspects that could be resolved.
        * ``skipped_indices``: running sample positions (counted in grouped
          order, i.e. doc by doc) of the samples that could not be resolved.
          These equal the positions in ``inputs`` only when samples sharing a
          text are adjacent in ``inputs``.

        NOTE(review): the return annotation only describes ``aspects_list``; it is
        left unchanged here because the required typing names may not be imported.
    """
    # First group inputs by text. Plain indexing (rather than ``pop``) keeps the
    # caller's samples intact; only "span" and "ordinal" are read afterwards.
    grouped_data = defaultdict(list)
    for sample in inputs:
        grouped_data[sample["text"]].append(sample)

    # Get the spaCy docs
    docs, _ = self.aspect_extractor(grouped_data.keys())

    # Get the aspect spans for each doc by matching gold spans to the spaCy tokens
    aspects_list = []
    skipped_indices = []
    index = -1
    for doc, samples in zip(docs, grouped_data.values()):
        doc_aspects = []
        aspects_list.append(doc_aspects)
        for sample in samples:
            index += 1

            # Find the ordinal-th literal occurrence of the span and stop there.
            match = None
            for i, candidate in enumerate(re.finditer(re.escape(sample["span"]), doc.text)):
                if i == sample["ordinal"]:
                    match = candidate
                    break

            if match is None:
                # Without this, an unresolvable sample would vanish silently and
                # the outputs would no longer account for every input sample.
                logger.warning(
                    f"Aspect term {sample['span']!r} with ordinal {sample['ordinal']} was not found in {doc.text!r}. "
                    "Skipping this sample."
                )
                skipped_indices.append(index)
                continue

            # ``char_span`` yields None when the character range cannot be mapped
            # onto the doc's tokens.
            span = doc.char_span(match.start(), match.end())
            if span is None:
                logger.warning(
                    f"Aspect term {sample['span']!r} with ordinal {sample['ordinal']}, isn't a token in {doc.text!r} according to spaCy. "
                    "Skipping this sample."
                )
                skipped_indices.append(index)
                continue

            doc_aspects.append(slice(span.start, span.end))
    return docs, aspects_list, skipped_indices