in sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/crack_and_chunk_with_doc_intel/document_intelligence_loader.py [0:0]
def extract_pdf_content(file, form_recognizer_client, use_layout=False):
    """Analyze a document and return its text one page at a time.

    The document is submitted through
    ``form_recognizer_client.begin_analyze_document`` with the
    ``"prebuilt-layout"`` model when ``use_layout`` is true and the
    ``"prebuilt-read"`` model otherwise. For every page of the result the
    page text is rebuilt from the analysis ``content``:

    * characters covered by the spans of a table located on that page are
      replaced, once per table, by the output of ``table_to_html``;
    * paragraphs whose ``role`` is a key of ``PDF_HEADERS`` are wrapped in
      ``<tag>``/``</tag>``, where ``tag`` is ``PDF_HEADERS[role]``;
    * a single space is appended to the end of every page.

    ``PDF_HEADERS`` and ``table_to_html`` are module-level names defined
    outside this function.

    Args:
        file: The document, forwarded unchanged as the ``document`` keyword
            argument of ``begin_analyze_document``.
        form_recognizer_client: Client object exposing
            ``begin_analyze_document(model, document=...)`` that returns a
            poller with a ``result()`` method.
        use_layout: Selects the ``"prebuilt-layout"`` model instead of
            ``"prebuilt-read"``.

    Returns:
        A list of ``(page_num, offset, page_text)`` tuples in page order.
        ``page_num`` is the zero-based page index and ``offset`` is the
        total length of the page texts that precede this page.
    """
    model = "prebuilt-layout" if use_layout else "prebuilt-read"
    poller = form_recognizer_client.begin_analyze_document(model, document=file)
    form_recognizer_results = poller.result()
    content = form_recognizer_results.content
    # Treat missing collections as empty so a sparse result does not raise.
    paragraphs = form_recognizer_results.paragraphs or []
    tables = form_recognizer_results.tables or []
    pages = form_recognizer_results.pages or []

    # Map absolute content offsets to the role of the paragraph that starts
    # or ends there (only the first span of each paragraph is considered).
    roles_start = {}
    roles_end = {}
    for paragraph in paragraphs:
        if paragraph.role is None or not paragraph.spans:
            continue
        first_span = paragraph.spans[0]
        roles_start[first_span.offset] = paragraph.role
        roles_end[first_span.offset + first_span.length] = paragraph.role

    page_map = []
    offset = 0
    for page_num, page in enumerate(pages):
        # Tables are assigned to a page via their first bounding region;
        # page numbers in the result are one-based.
        tables_on_page = [
            table
            for table in tables
            if table.bounding_regions
            and table.bounding_regions[0].page_number == page_num + 1
        ]

        page_offset = page.spans[0].offset
        page_length = page.spans[0].length

        # table_chars[i] is the index (into tables_on_page) of the table
        # covering character i of the page, or -1 for ordinary text.
        table_chars = [-1] * page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                # Clamp the span to the page so out-of-page parts are ignored.
                first = max(span.offset - page_offset, 0)
                last = min(span.offset - page_offset + span.length, page_length)
                for idx in range(first, last):
                    table_chars[idx] = table_id

        # Collect fragments and join once instead of growing a string
        # character by character.
        parts = []
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                position = page_offset + idx
                start_role = roles_start.get(position)
                if start_role is not None and start_role in PDF_HEADERS:
                    parts.append(f"<{PDF_HEADERS[start_role]}>")
                # pop() so a closing tag is never emitted twice across pages.
                end_role = roles_end.pop(position, None)
                if end_role is not None and end_role in PDF_HEADERS:
                    parts.append(f"</{PDF_HEADERS[end_role]}>")
                parts.append(content[position])
            elif table_id not in added_tables:
                # Emit each table once, at the first character it covers.
                parts.append(table_to_html(tables_on_page[table_id]))
                added_tables.add(table_id)

        # A header that ends exactly at the end of the page span has its end
        # offset one past the last visited character; close it here so the
        # tag opened on this page is not left dangling.
        end_role = roles_end.pop(page_offset + page_length, None)
        if end_role is not None and end_role in PDF_HEADERS:
            parts.append(f"</{PDF_HEADERS[end_role]}>")

        parts.append(" ")
        page_text = "".join(parts)
        page_map.append((page_num, offset, page_text))
        offset += len(page_text)

    return page_map