# def extract_pdf_data()
#
# in Autogen_v0.4/rag_agent/search_helper.py [0:0]


def extract_pdf_data(pdf_folder, extract_folder):
    """Extract per-page text from every entry in *pdf_folder* and save it as JSON.

    Each entry name returned by ``os.listdir(pdf_folder)`` is passed to
    ``get_document_layout(pdf_folder, doc_name)``. The result is expected to
    expose ``.pages``; each page exposes ``.lines`` and ``.page_number``, and
    each line exposes ``.content`` (these are the only attributes read here).

    For every page that has lines, one record is produced with the keys
    ``doc_name``, ``page_number``, ``line_number`` and ``content``:

    * ``content`` is every line's text followed by a single space,
      concatenated in order (so it ends with a trailing space).
    * ``line_number`` is the zero-based index of the *last* line on the page.

    The records of one document are written to
    ``<extract_folder>/<doc_name>-document_data.json``.

    Args:
        pdf_folder: Directory whose entries are analyzed.
        extract_folder: Directory that receives the JSON files. It is created
            (including parents) if it does not exist yet.

    Returns:
        None. Results are written to disk and progress is printed to stdout.
    """
    # List the folder once; re-listing per entry is wasted I/O and can raise
    # IndexError if the folder contents change while scanning.
    doc_names = os.listdir(pdf_folder)

    # Create the destination up front so a missing folder does not surface
    # only after a document has already been analyzed.
    os.makedirs(extract_folder, exist_ok=True)

    for doc_name in doc_names:
        print(f"Analyzing document: {doc_name}")
        result = get_document_layout(pdf_folder, doc_name)
        print(f"Layout analysis completed for document: {doc_name}")
        print(f"Processing document: {doc_name}...")

        document_data = []
        for page in result.pages:
            # Pages without any lines produce no record.
            if not page.lines:
                continue

            line_texts = [line.content for line in page.lines]
            document_data.append(
                {
                    "doc_name": doc_name,
                    "page_number": page.page_number,
                    # Index of the last line on the page (zero-based).
                    "line_number": len(line_texts) - 1,
                    "content": "".join(text + " " for text in line_texts),
                }
            )

        output_file_path = os.path.join(
            extract_folder, doc_name + "-document_data.json"
        )
        with open(output_file_path, "w", encoding="utf-8") as f:
            json.dump(document_data, f)