in comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/groundtruth_to_comprehend_clr_format_converter.py [0:0]
def convert_to_multilabel_dataset(self, index, jsonLine, label_delimiter):
    """Validate one augmented-manifest line and extract its labels and source.

    The line is parsed with ``_parse_manifest_input``; the parsed object must
    contain the ``SOURCE`` key and, under the labeling job name, the
    ``CLASS_MAP`` key. The source and every label in the class map are
    checked against ``default_limits`` before the labels are assembled by
    ``_get_labels``.

    Side effects: stores ``label_delimiter`` on ``self.label_delimiter`` and
    the resolved job name on ``self.labeling_job_name``.

    Args:
        index: Line identifier of ``jsonLine`` in the manifest; passed to the
            validation helpers and included in error messages.
        jsonLine: Raw manifest line to convert.
        label_delimiter: Separator used to split the assembled labels when
            looking for empty entries.

    Returns:
        A ``(labels, source)`` tuple, or ``None`` when
        ``_parse_manifest_input`` returns ``None``.

    Raises:
        Exception: If ``SOURCE`` or ``CLASS_MAP`` is missing, if the class map
            is empty, or if any label obtained by splitting on the delimiter
            is empty. The validation helpers may raise as well.
    """
    self.label_delimiter = label_delimiter

    record = self._parse_manifest_input(index, jsonLine)
    if record is None:
        # Nothing was parsed for this line, so there is nothing to convert.
        return None

    def unparseable_line():
        # Both structural checks below report the same parse failure.
        return Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(
            line=index, file_name=self.groundtruth_manifest_file_name))

    # The document text must be present and within the size limits.
    if SOURCE not in record.keys():
        raise unparseable_line()
    source = record[SOURCE]
    self._check_document_size(source, index, limits=default_limits)

    # The annotations live under a key named after the labeling job.
    self.labeling_job_name = self.get_labeling_job_name(index, record)
    job_entry = record[self.labeling_job_name]
    if CLASS_MAP not in job_entry.keys():
        raise unparseable_line()
    class_map = job_entry[CLASS_MAP]

    # A document without any label is rejected.
    if len(class_map) == 0:
        raise Exception(EMPTY_LABEL_UNSUPPORTED.substitute(filename=self.groundtruth_manifest_file_name))

    # Every individual label must respect the configured size limit.
    for candidate in class_map.values():
        self._check_label_size(candidate, index, limits=default_limits)

    labels = self._get_labels(class_map)

    # Splitting on the delimiter exposes empty labels hidden in the joined
    # value (for example a leading, trailing or doubled delimiter).
    if any(not piece for piece in labels.split(self.label_delimiter)):
        raise Exception(EMPTY_LABEL_FOUND.substitute(line=index,
                                                     file=self.groundtruth_manifest_file_name))

    return labels, source