in data_measurements/labels/labels.py [0:0]
def prepare_labels(self, label_field, label_names=None):
    """Measure the label distribution of one column of ``self.dset``.

    The measurement is computed with the ``evaluate`` library (the module
    named by ``EVAL_LABEL_MEASURE``) and is extended with a ``"sums"``
    entry holding the number of occurrences of each distinct label,
    ordered by sorted label value.

    Args:
        label_field: Name of the column in ``self.dset`` that holds the
            labels.
        label_names: Optional names for the labels. When empty or ``None``
            they are looked up with ``extract_label_names`` using
            ``self.ds_name`` and ``self.config_name``.

    Returns:
        The value produced by ``make_label_results_dict`` for the
        measurement and the label names, or an empty dict when
        ``label_field`` is not a feature of ``self.dset``.
    """
    logs.info("Inside main label calculation function.")
    logs.debug("Looking for label field called '%s'", label_field)
    # A missing label column is not an error: it is logged and an empty
    # result is returned so the caller can carry on without label stats.
    if label_field not in self.dset.features:
        logs.warning("No label column found -- nothing to do. Returning.")
        logs.debug(self.dset.features)
        return {}
    label_list = self.dset[label_field]
    # Get the evaluate library's measurement for label distro.
    label_distribution = evaluate.load(EVAL_LABEL_MEASURE)
    # Measure the label distro.
    label_measurement = label_distribution.compute(data=label_list)
    # TODO: Incorporate this summation into what the evaluate library returns.
    label_sum_dict = Counter(label_list)
    # Sort by label value so the counts come out in a deterministic order.
    label_sums = [label_sum_dict[key] for key in sorted(label_sum_dict)]
    label_measurement["sums"] = label_sums
    if not label_names:
        # Have to extract the label names from the Dataset object when the
        # actual dataset columns are just ints representing the label names.
        label_names = extract_label_names(
            label_field, self.ds_name, self.config_name
        )
    return make_label_results_dict(label_measurement, label_names)