in comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/EntityRecognizer/groundtruth_to_comprehend_format_converter.py [0:0]
def convert_to_dataset_annotations(self, index, jsonLine):
    """Convert one augmented-manifest JSON line into a (source, annotations) pair.

    Parses *jsonLine*, validates the SOURCE text and every labeled entity's
    offsets, and collects the annotations in Comprehend dataset format.

    :param index: line number of *jsonLine* within the manifest (used in error messages)
    :param jsonLine: raw JSON string for one manifest entry
    :return: tuple of (source text, list of
             (input_file_name, index, begin_offset, end_offset, label) tuples)
    :raises Exception: if the line lacks a SOURCE field, the document exceeds
             MAX_TRAIN_DOC_SIZE, an annotation's end offset precedes its begin
             offset, or an offset falls outside the document.
    """
    # Parse the jsonLine to generate the dataset entry.
    jsonObj = self.parse_manifest_input(jsonLine)
    if SOURCE not in jsonObj:
        raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
                                                                   file_name=self.groundtruth_manifest_file_name))
    source = jsonObj[SOURCE]
    # Offsets are byte offsets into the UTF-8 encoding; encode once and reuse
    # the length for both the size check and the offset bound.
    source_size = len(source.encode('utf-8'))
    if source_size > MAX_TRAIN_DOC_SIZE:
        raise Exception(DOC_SIZE_EXCEEDED.substitute(file=self.groundtruth_manifest_file_name,
                                                     line=index,
                                                     size=MAX_TRAIN_DOC_SIZE))
    self.maximum_offset = source_size
    # Parse the jsonLine to generate the annotations entry.
    annotations = []
    self.labeling_job_name = self.get_labeling_job_name(index, jsonObj)
    labeling_job_info = jsonObj[self.labeling_job_name][ANNOTATIONS][ENTITIES]
    for entity in labeling_job_info:
        begin_offset = int(entity[START_OFFSET])
        end_offset = int(entity[END_OFFSET])
        label = entity[LABEL]
        # An annotation must span forward: end must not precede begin.
        if end_offset < begin_offset:
            raise Exception(WRONG_ANNOTATION.substitute(file_name=self.groundtruth_manifest_file_name,
                                                        line=int(index),
                                                        begin_offset=begin_offset,
                                                        end_offset=end_offset,
                                                        message=INVALID_END_OFFSET))
        # Both offsets must lie within the document (begin strictly inside,
        # end at most at the document boundary).
        if (begin_offset >= self.maximum_offset) or (end_offset > self.maximum_offset):
            raise Exception(INVALID_OFFSETS.substitute(doc=self.groundtruth_manifest_file_name,
                                                       line_index=index,
                                                       begin_offset=begin_offset,
                                                       end_offset=end_offset,
                                                       line_size=self.maximum_offset))
        annotations.append((self.input_file_name, index, begin_offset, end_offset, label))
    # Reject entity spans that overlap one another within this document.
    self._check_for_overlapping_annotations(annotations)
    return source, annotations