in src/evaluate/evaluator/token_classification.py [0:0]
def prepare_data(self, data: Union[str, Dataset], input_column: str, label_column: str, join_by: str):
    """Build the metric references and the pipeline inputs for token classification.

    The parent class's ``prepare_data`` is invoked first with the same data and
    column names. Both columns must then be ``Sequence`` features, otherwise a
    ``ValueError`` is raised.

    Args:
        data: Dataset whose ``features`` describe ``input_column`` and ``label_column``.
            NOTE(review): annotated as ``Union[str, Dataset]``, but ``.features`` and
            ``.map`` are used below, so a loaded dataset is presumably expected here.
        input_column: Name of the column holding the list of tokens per example.
        label_column: Name of the column holding the list of labels per example.
        join_by: Separator used to join each example's tokens into a single string.

    Returns:
        A ``(metric_inputs, pipeline_inputs)`` tuple: ``metric_inputs`` is
        ``{"references": ...}`` with one list of labels per example, and
        ``pipeline_inputs`` is a ``DatasetColumn`` over ``input_column`` of the
        dataset after the tokens have been joined.

    Raises:
        ValueError: If either column is not a ``Sequence`` feature.
        NotImplementedError: If the labels have an integer dtype but are not
            ``ClassLabel`` instances, so no id -> name mapping is available.
    """
    super().prepare_data(data, input_column, label_column)

    features = data.features
    both_are_sequences = isinstance(features[input_column], Sequence) and isinstance(
        features[label_column], Sequence
    )
    if not both_are_sequences:
        raise ValueError(
            "TokenClassificationEvaluator expects the input and label columns to be provided as lists."
        )

    label_type = features[label_column].feature
    if isinstance(label_type, ClassLabel):
        # ClassLabel columns store integer ids; translate them back to their string names.
        id_to_name = dict(enumerate(label_type.names))
        references = [[id_to_name[label_id] for label_id in row] for row in data[label_column]]
    else:
        if label_type.dtype.startswith("int"):
            # Plain integers carry no id -> name mapping, so they cannot be converted.
            raise NotImplementedError(
                "References provided as integers, but the reference column is not a Sequence of ClassLabels."
            )
        # Labels are already strings, e.g. ["PER", "PER", "O", "LOC", "O", "LOC", "O"]
        # as in the polyglot_ner dataset.
        references = data[label_column]

    def _join_tokens(example):
        # Collapse the token list into one string for the pipeline.
        return {input_column: join_by.join(example[input_column])}

    joined = data.map(_join_tokens)
    return {"references": references}, DatasetColumn(joined, input_column)