in src/chug/doc/doc_read_processor.py [0:0]
def _decode_anno(self, sample):
    """Decode the annotation of ``sample`` and extract its page metadata.

    The annotation is looked up in ``sample`` via ``self.text_input_key`` and
    handed to ``self._process_anno_pages``. Page metadata is then read from the
    optional ``'_parse'`` entry of the processed annotation.

    Args:
        sample: The sample to decode. The error path reads its ``'__url__'``
            and ``'__key__'`` entries for logging (assumes a dict-like object
            exposing ``.get`` -- TODO confirm against callers).

    Returns:
        A tuple ``(page_anno, page_indices, num_anno_pages)`` where
        ``page_anno`` is the result of ``self._process_anno_pages``,
        ``page_indices`` is ``'_parse'['page_indices']`` (default ``[0]``) and
        ``num_anno_pages`` is ``'_parse'['num_pages']`` (default ``1``).

    Raises:
        AssertionError: If no annotation is found for ``self.text_input_key``.
        Exception: Any error raised by ``self._process_anno_pages`` is logged
            and re-raised unchanged.
    """
    anno = _get_value(self.text_input_key, sample)
    if anno is None:
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # AssertionError is kept so callers observe the same exception type.
        raise AssertionError(f"No annotation found with keys ({self.text_input_key}).")

    try:
        page_anno = self._process_anno_pages(anno)
    except Exception:
        # .get() keeps a sample without these keys from raising KeyError here,
        # which would replace the real processing error being reported.
        _logger.error(
            f'Issue processing annotation for {sample.get("__url__")}, {sample.get("__key__")}.'
        )
        # Bare raise re-raises the active exception with its original traceback.
        raise

    # Extract info from the '_parse' entry; fall back to a single page at index 0.
    info = page_anno.get('_parse', {})
    page_indices = info.get('page_indices', [0])  # the sample's page indices
    num_anno_pages = info.get('num_pages', 1)

    # TODO support 'image info' to relay details such as text bbox, layout
    # page_image_info = info.get('image_info', None)
    # if page_image_info is not None:
    #     assert len(page_image_info) == len(page_indices)

    return page_anno, page_indices, num_anno_pages