in tpipelinepagedimensions/textractpagedimensions/t_pagedimensions.py [0:0]
def add_page_dimensions(t_document: t2.TDocument, input_document: Union[str, bytes]) -> t2.TDocument:
    """
    Add the page dimensions to each page of the document as a custom property on the page Block,
    e. g. {'PageDimension': {'doc_width': 1549.0, 'doc_height': 370.0} }

    The dimensions are read from ``input_document``:

    * a ``str`` longer than 7 characters that starts with ``s3://`` (any letter case) is split
      into bucket and key and passed to ``get_width_height_from_s3_object``;
    * any other ``str`` is passed to ``get_width_height_from_file`` as ``filepath``;
    * ``bytes`` / ``bytearray`` are wrapped in ``io.BytesIO`` and passed to
      ``get_size_from_filestream`` with ``ext=None``.

    Any other input type leaves the list of dimensions empty, so the page-count check below
    fails unless the document has no pages either.

    Args:
        t_document: document whose ``pages`` blocks receive the ``PageDimension`` entry;
            the blocks are modified in place.
        input_document: S3 URI, file path, or raw content of the source document.

    Returns:
        The same ``t_document`` instance that was passed in.

    Raises:
        ValueError: if an S3 URI has no ``/`` separating the bucket from the object key.
        AssertionError: if the number of pages in ``t_document`` differs from the number of
            dimensions obtained from ``input_document``.
    """
    s3_scheme = "s3://"
    page_dimensions: List[DocumentDimensions] = []
    if isinstance(input_document, str):
        if len(input_document) > 7 and input_document.lower().startswith(s3_scheme):
            # Slice the scheme off by length rather than str.replace(): the check above is
            # case-insensitive ("S3://..." must work too) and replace() would also remove any
            # later "s3://" occurring inside the object key.
            s3_bucket, separator, s3_key = input_document[len(s3_scheme):].partition("/")
            if not separator:
                raise ValueError(
                    f"S3 location must have the form s3://<bucket>/<key>, got: {input_document}")
            page_dimensions = get_width_height_from_s3_object(s3_bucket=s3_bucket, s3_key=s3_key)
        else:
            page_dimensions = get_width_height_from_file(filepath=input_document)
    elif isinstance(input_document, (bytes, bytearray)):
        page_dimensions = get_size_from_filestream(io.BytesIO(input_document), ext=None)

    if len(t_document.pages) != len(page_dimensions):
        raise AssertionError(
            f"number of pages in document did not match number of dimensions received: document-pages: {len(t_document.pages)}, dimension-pages: {len(page_dimensions)}"
        )

    for idx, block in enumerate(t_document.pages):
        # Prefer the page number stored on the block (used as a 1-based index); when the block
        # carries no page number, fall back to the block's position in the page list.
        dimension = page_dimensions[block.page - 1] if block.page else page_dimensions[idx]
        if block.custom:
            # Keep whatever custom properties are already present on the block.
            block.custom['PageDimension'] = asdict(dimension)
        else:
            block.custom = {'PageDimension': asdict(dimension)}
    return t_document