def convert_to_dataset_annotations()

in comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/EntityRecognizer/groundtruth_to_comprehend_format_converter.py [0:0]


    def convert_to_dataset_annotations(self, index, jsonLine):
        """Convert one manifest line into its document text and annotation rows.

        The line is parsed with ``self.parse_manifest_input``; the document
        text is read from the ``SOURCE`` key and the labelled entities from
        ``jsonObj[<labeling job name>][ANNOTATIONS][ENTITIES]``.

        Side effects: sets ``self.maximum_offset`` to the UTF-8 byte length of
        the document and ``self.labeling_job_name`` to the value returned by
        ``self.get_labeling_job_name``.

        Args:
            index: Line identifier of ``jsonLine`` in the manifest; it is
                reported in error messages and stored in every annotation row.
            jsonLine: Raw manifest line handed to ``self.parse_manifest_input``.

        Returns:
            A ``(source, annotations)`` tuple where ``annotations`` is a list
            of ``(self.input_file_name, index, begin_offset, end_offset,
            label)`` tuples, in the order the entities appear in the manifest.

        Raises:
            Exception: If the parsed line has no ``SOURCE`` key, if the
                document exceeds ``MAX_TRAIN_DOC_SIZE`` bytes, if an entity's
                end offset is smaller than its begin offset, or if an offset
                is negative or lies outside the document.
        """
        # parse the jsonLine to generate the dataset entry
        jsonObj = self.parse_manifest_input(jsonLine)
        if SOURCE not in jsonObj:
            raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
                                                                       file_name=self.groundtruth_manifest_file_name))
        source = jsonObj[SOURCE]
        # Encode once: the byte length is both the size limit check and the
        # upper bound used to validate annotation offsets below.
        source_size = len(source.encode('utf-8'))
        if source_size > MAX_TRAIN_DOC_SIZE:
            raise Exception(DOC_SIZE_EXCEEDED.substitute(file=self.groundtruth_manifest_file_name,
                                                         line=index,
                                                         size=MAX_TRAIN_DOC_SIZE))
        self.maximum_offset = source_size

        # parse the jsonLine to generate the annotations entry
        self.labeling_job_name = self.get_labeling_job_name(index, jsonObj)
        entities = jsonObj[self.labeling_job_name][ANNOTATIONS][ENTITIES]

        annotations = []
        for entity in entities:
            begin_offset = int(entity[START_OFFSET])
            end_offset = int(entity[END_OFFSET])
            label = entity[LABEL]
            if end_offset < begin_offset:
                raise Exception(WRONG_ANNOTATION.substitute(file_name=self.groundtruth_manifest_file_name,
                                                            line=int(index),
                                                            begin_offset=begin_offset,
                                                            end_offset=end_offset,
                                                            message=INVALID_END_OFFSET))
            # A negative begin offset can never address a position in the
            # document; since end_offset >= begin_offset is already guaranteed
            # here, checking begin_offset also covers a negative end offset.
            if (begin_offset < 0
                    or begin_offset >= self.maximum_offset
                    or end_offset > self.maximum_offset):
                raise Exception(INVALID_OFFSETS.substitute(doc=self.groundtruth_manifest_file_name,
                                                           line_index=index,
                                                           begin_offset=begin_offset,
                                                           end_offset=end_offset,
                                                           line_size=self.maximum_offset))
            annotations.append((self.input_file_name, index, begin_offset, end_offset, label))

        self._check_for_overlapping_annotations(annotations)

        return source, annotations