in Autogen_v0.4/rag_agent/search_helper.py [0:0]
def _describe_doc(doc):
    """Return ``(doc_name, page_number)`` for log messages without raising.

    Used only by the error path, so it must tolerate records that are not
    dicts or that lack the expected keys.
    """
    if isinstance(doc, dict):
        return doc.get("doc_name", "<unknown>"), doc.get("page_number", "<unknown>")
    return "<unknown>", "<unknown>"


def _build_index_item(doc):
    """Query the LLM for one page record and build its search-index entry.

    Reads ``doc["content"]``, ``doc["doc_name"]`` and ``doc["page_number"]``.
    The LLM reply is parsed as JSON and must contain the keys ``title``,
    ``topics`` and ``categories``. Any failure (missing key, LLM error,
    invalid JSON) propagates to the caller.
    """
    user_query = f"""Extract the Title, topics and categories from the document.
Document:
{doc["content"]}
"""
    llm_response = openai_helper.getOpenAIResp(user_query)
    llm_json = json.loads(llm_response)
    return {
        "id": str(uuid.uuid4()),
        "doc_name": doc["doc_name"],
        "page_number": doc["page_number"],
        "title": llm_json["title"],
        "content": doc["content"],
        # Categories and topics are stored as JSON-encoded strings, not lists.
        "category": json.dumps(llm_json["categories"]),
        "tags": json.dumps(llm_json["topics"]),
        "lastupdated": str(datetime.now()),
    }


def enrich_pdf_data(extracted_data_folder: str, output_file_name: str) -> None:
    """Enrich extracted page records with LLM metadata and write an index file.

    Every regular file in ``extracted_data_folder`` is parsed as a JSON array
    of page records. For each record the LLM is asked for a title, topics and
    categories, and the combined entry is collected. The collected entries are
    written to ``output_file_name`` as a single JSON array.

    Processing is best-effort per record: when a record fails, the error is
    appended to ``error.log`` in the current working directory, echoed to
    stdout, and the record is skipped.

    Args:
        extracted_data_folder: Directory holding the extracted JSON files.
        output_file_name: Path of the JSON file to (over)write.

    Raises:
        OSError: If the folder or one of the files cannot be read or written.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    # NOTE(review): this prompt is never passed to openai_helper.getOpenAIResp,
    # so it currently has no effect. Confirm whether the helper accepts a
    # system prompt and wire it in, or remove it. The JSON example below is
    # also missing a comma after the "title" entry.
    system_message = """
You are an AI assitant who can extract title, topics and cateogries from a document.
You will be given a document and you need to extract the title, topics and categories from the document in json format.
Retain the language in the document while extracting the title, topics and categories.
Title: Extract the title of the document that captures the information in the document in the original document language.
Topics: Extract the topics from the document that best describe the content in the original document language.
Categories: Extract the categories from the document that best describe the content in the original document language.
Do not write ```json and ``` in your response.
json format:
{
"title": "Document Title"
"topics": ["topic1 in the do", "topic2"],
"categories": ["category1", "category2"]
}
"""
    aml_index_data = []

    # os.listdir order is arbitrary; sort so the output is reproducible.
    for entry_name in sorted(os.listdir(extracted_data_folder)):
        entry_path = os.path.join(extracted_data_folder, entry_name)
        if not os.path.isfile(entry_path):
            # Sub-directories cannot be opened as files; skip them.
            continue

        # Load eagerly so the handle is not held open during the LLM calls.
        with open(entry_path, "r", encoding="utf-8") as source_file:
            aml_docs_json = json.load(source_file)
        print(f"Processing document: {entry_path}")

        for doc in aml_docs_json:
            try:
                aml_index_data.append(_build_index_item(doc))
            except Exception as e:
                # Deliberate best-effort boundary: one bad record must not
                # abort the whole batch.
                doc_name, page_number = _describe_doc(doc)
                message = f"Error processing document: {doc_name}, {page_number} - {e}"
                # Explicit UTF-8 so non-ASCII names cannot make logging fail.
                with open("error.log", "a", encoding="utf-8") as error_log:
                    error_log.write(message + "\n")
                print(message)

    with open(output_file_name, "w", encoding="utf-8") as output_file:
        json.dump(aml_index_data, output_file)