in src/chug/doc/doc_processor.py [0:0]
def __call__(self, sample):
    """Decode one raw webdataset sample into page image(s) plus annotations.

    Args:
        sample: dict of raw sample fields keyed by extension; may contain an
            undecoded ``'json'`` bytes payload and one of the image/document
            extensions listed in ``self.image_input_key``.

    Returns:
        Either a single dict ``{self.image_input_name: image(s), **annotations}``
        or, when ``self.expand_pages`` is set and multiple pages were decoded,
        a list of one such dict per page.

    Raises:
        AssertionError: if no page images could be decoded from the sample.
    """
    # Decode the json payload if it is still raw bytes.
    raw_json = sample.get('json')
    if isinstance(raw_json, bytes):
        sample['json'] = json.loads(raw_json)
    if self.flatten_json and 'json' in sample:
        # Hoist the json fields up into the sample dict itself.
        sample.update(sample.pop('json'))

    # FIXME separate decode & preprocess interfaces
    # Decode page annotations / text first; the page indices and page count
    # drive which document pages get rendered below.
    page_anno, page_indices, num_anno_pages = self._decode_anno(sample)

    # Decode page images. image_input_key is ordered by priority; only the
    # first document type present in the sample is processed.
    page_images = []
    for img_ext in self.image_input_key:
        if img_ext not in sample:
            continue
        decode_fn = self._decode_pdf_pages if img_ext == 'pdf' else self._decode_image_pages
        pages, num_image_pages = decode_fn(
            sample,
            img_ext,
            page_indices,
            num_anno_pages,
        )
        page_images.extend(pages)
        break  # one document type per doc

    assert len(page_images), 'No page images present'

    if self.expand_pages and len(page_images) > 1:
        # Fan out: one output sample per page, pairing each page image with
        # its expanded per-page annotation dict.
        per_page_anno = self._expand_anno(page_anno, len(page_images))
        return [
            {self.image_input_name: img, **anno}
            for img, anno in zip(page_images, per_page_anno)
        ]

    if self.squeeze_pages and len(page_images) == 1:
        # Collapse single-element page & annotation lists to scalars.
        page_images = page_images[0]
        page_anno = self._squeeze_anno(page_anno)
    return {self.image_input_name: page_images, **page_anno}