in sagemaker/src/htr_dataset.py [0:0]
def _read_line_annotation(self, annotation_dict):
    '''
    Build one line-level record per text line found in a single
    output.manifest entry.

    The page-level annotation is obtained from `_read_page_annotation`,
    its entries are grouped by their 'line_num', and every group is
    collapsed into a single record holding the texts, the individual
    bbs and one enclosing line bb.

    Parameter:
    ----------
    annotation_dict: {}
        line from the output.manifest

    Return:
    -------
    line_annotations: [{}]
        One dict per distinct 'line_num', emitted in the order the line
        numbers are first encountered (the groups are not sorted).
        Each dict has the keys:
          "filename":   the filename of the page annotation
          "annotation": a single-element list containing a dict with
              "text":    texts of the line, as returned by
                         `_sort_texts_on_bb_x`
              "line_bb": result of `_convert_bb_list_to_max_bb` on the
                         reordered bbs
              "bb":      bbs of the line, as returned by
                         `_sort_texts_on_bb_x`
        NOTE(review): the original documentation states that bbs are
        converted from polygons to rectangles; that conversion is not
        visible in this method - presumably it happens in
        `_read_page_annotation`, confirm there.
    '''
    page = self._read_page_annotation(annotation_dict)

    # Bucket the page entries by line number. A plain dict is used so the
    # groups come back out in first-seen order.
    grouped = {}
    for entry in page['annotation']:
        grouped.setdefault(entry['line_num'], []).append(entry)

    results = []
    for members in grouped.values():
        # Reorder the bbs and their texts together (helper name suggests
        # ordering along the x axis - see `_sort_texts_on_bb_x`).
        boxes, words = self._sort_texts_on_bb_x(
            [member['bb'] for member in members],
            [member['text'] for member in members],
        )
        # Single bb representing the whole line, derived from its bbs.
        line_box = self._convert_bb_list_to_max_bb(boxes)
        results.append({
            "filename": page["filename"],
            "annotation": [{
                "text": words,
                "line_bb": line_box,
                "bb": boxes
            }]
        })
    return results