def __call__()

in src/chug/doc/doc_processor.py [0:0]


    def __call__(self, sample):
        """Decode one raw document sample into page image(s) plus annotations.

        Processing steps, in order:

        1. If ``sample['json']`` is ``bytes``, it is parsed with ``json.loads``.
        2. If ``self.flatten_json`` is set, the entries of ``sample['json']``
           are merged into ``sample`` and the ``'json'`` key is removed.
        3. Annotations are decoded with ``self._decode_anno``.
        4. Page images are decoded from the first key of
           ``self.image_input_key`` that is present in ``sample``: the
           ``'pdf'`` key goes through ``self._decode_pdf_pages``, any other
           key through ``self._decode_image_pages``.
        5. The result is expanded or squeezed according to
           ``self.expand_pages`` / ``self.squeeze_pages``.

        Args:
            sample: Mutable mapping holding the raw sample. It is modified in
                place by steps 1 and 2.

        Returns:
            A list of per-page dicts when ``self.expand_pages`` is set and more
            than one page image was decoded; otherwise a single dict. In both
            cases the image(s) are stored under ``self.image_input_name`` and
            the annotation entries are merged in alongside.

        Raises:
            AssertionError: If no page images were decoded (no configured key
                is present in ``sample``, or the decoder returned none).
        """
        if 'json' in sample and isinstance(sample['json'], bytes):
            # decode json if present and in undecoded state
            sample['json'] = json.loads(sample['json'])

        if self.flatten_json and 'json' in sample:
            # flatten json into sample (this mutates the caller's mapping)
            sample.update(sample.pop('json'))

        # FIXME separate decode & preprocess interfaces

        # decode page annotations / text
        page_anno, page_indices, num_anno_pages = self._decode_anno(sample)

        # decode page images
        page_images = []
        for ext in self.image_input_key:
            if ext not in sample:
                continue
            decode_pages = self._decode_pdf_pages if ext == 'pdf' else self._decode_image_pages
            # the decoders return (images, page count); the count is not used here
            images, _ = decode_pages(sample, ext, page_indices, num_anno_pages)
            page_images.extend(images)
            # process one document type per doc, should be ordered by priority
            break

        if not page_images:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; exception type and message are
            # kept identical for existing callers.
            raise AssertionError('No page images present')

        num_pages = len(page_images)
        if self.expand_pages and num_pages > 1:
            # expand pages and page annotations into multiple samples (return list of sample dicts)
            page_anno = self._expand_anno(page_anno, num_pages)
            return [
                {self.image_input_name: image, **anno}
                for image, anno in zip(page_images, page_anno)
            ]

        if self.squeeze_pages and num_pages == 1:
            # squeeze page & annotation lists into singular items
            page_images = page_images[0]
            page_anno = self._squeeze_anno(page_anno)
        return {self.image_input_name: page_images, **page_anno}