in src/setfit/model_card.py [0:0]
def set_train_set_metrics(self, dataset: Dataset) -> None:
    """Collect word-count and per-label statistics of the training set.

    A one-row summary (min / median / max of a naive, space-separated word
    count over the ``"text"`` column) is stored in
    ``self.train_set_metrics_list``. If the dataset also has a ``"label"``
    column whose values are scalars (not lists/tuples), the number of samples
    per label is stored in ``self.train_set_sentences_per_label_list``;
    otherwise that attribute is left untouched.

    Args:
        dataset: Training data with a ``"text"`` column and, optionally, a
            ``"label"`` column.
    """
    # Function-scope import: keeps this method self-contained without relying
    # on the file-level import block.
    from statistics import median

    def add_naive_word_count(sample: Dict[str, Any]) -> Dict[str, Any]:
        # "Naive": splits on single spaces only, so runs of spaces produce
        # empty strings that are counted as words.
        sample["word_count"] = len(sample["text"].split(" "))
        return sample

    dataset = dataset.map(add_naive_word_count)
    # Read the column once instead of once per statistic.
    word_counts = list(dataset["word_count"])
    self.train_set_metrics_list = [
        {
            "Training set": "Word count",
            "Min": min(word_counts),
            # The actual median, so the value matches its label (an average
            # would be skewed by a few very long texts).
            "Median": median(word_counts),
            "Max": max(word_counts),
        },
    ]

    # E.g. if unlabeled via DistillationTrainer
    if "label" not in dataset.column_names:
        return

    sample_label = dataset[0]["label"]
    # Sequence-valued labels (anything but a plain string) are not summarised
    # per label.
    if isinstance(sample_label, collections.abc.Sequence) and not isinstance(sample_label, str):
        return

    try:
        counter = Counter(dataset["label"])
        if self.model.labels:
            # The model knows its label names: report them in the model's
            # order. String labels are looked up directly; any other label
            # type is translated through label2id first.
            self.train_set_sentences_per_label_list = [
                {
                    "Label": str_label,
                    "Training Sample Count": counter[
                        str_label if isinstance(sample_label, str) else self.model.label2id[str_label]
                    ],
                }
                for str_label in self.model.labels
            ]
        else:
            # No label names available: fall back to the raw label values,
            # sorted, rendered as strings.
            self.train_set_sentences_per_label_list = [
                {
                    "Label": str(label),
                    "Training Sample Count": count,
                }
                for label, count in sorted(counter.items())
            ]
    except Exception:
        # There are some tricky edge cases possible, e.g. if the user provided integer labels that do not fall
        # between 0 to num_classes-1, so we make sure we never cause errors.
        pass