in Autogen_v0.4/rag_agent/search_helper.py [0:0]
def extract_pdf_data(pdf_folder, extract_folder) -> None:
    """Analyze every entry of *pdf_folder* and dump its page text as JSON.

    For each name returned by ``os.listdir(pdf_folder)``,
    ``get_document_layout(pdf_folder, doc_name)`` is called and one record is
    built per page that has lines. The records of a document are written to
    ``<extract_folder>/<doc_name>-document_data.json``.

    Each record has the keys ``doc_name``, ``page_number``, ``line_number``
    (index of the last line on the page) and ``content`` (the contents of the
    page's lines, each followed by a single space).

    Args:
        pdf_folder: Directory whose entries are analyzed. Every entry is
            processed; no filtering by file type is done here.
        extract_folder: Directory that receives the JSON files. It is not
            created here, so it must already exist.
    """
    # Snapshot the listing once so the set of documents stays stable for the
    # whole run and the directory is not re-scanned per document.
    doc_names = os.listdir(pdf_folder)

    for doc_name in doc_names:
        print(f"Analyzing document: {doc_name}")
        result = get_document_layout(pdf_folder, doc_name)
        print(f"Layout analysis completed for document: {doc_name}")
        print(f"Processing document: {doc_name}...")

        document_data = []
        for page in result.pages:
            # Pages without any recognized lines produce no record.
            if not page.lines:
                continue

            line_texts = [line.content for line in page.lines]
            # NOTE(review): one record per page is assumed here; the reviewed
            # copy had lost its indentation, so confirm the record was not
            # meant to be emitted once per line.
            document_data.append(
                {
                    "doc_name": doc_name,
                    "page_number": page.page_number,
                    # Kept for schema compatibility: index of the page's last line.
                    "line_number": len(line_texts) - 1,
                    # Every line is followed by one space, including the last.
                    "content": " ".join(line_texts) + " ",
                }
            )

        output_file_path = os.path.join(
            extract_folder, doc_name + "-document_data.json"
        )
        with open(output_file_path, "w", encoding="utf-8") as f:
            json.dump(document_data, f)