in ai-sample/TokenizeDocFunction/function_app.py [0:0]
def load_file(req):
    """Decode a base64-encoded document from *req* and load its content.

    Expects ``req`` to be a dict with keys:
      - ``base64Content``: base64-encoded file bytes
      - ``documentType``: one of PDF, DOCUMENT, MARKUP, TXT, PPTX, HTML
        (case-insensitive)
      - ``splittingStrategy``: splitter name (case-insensitive)

    Returns the decoded raw text (str) when the document is MARKUP/HTML and
    the splitting strategy matches (those splitters consume raw text), else
    a list of loader documents with the ``source`` metadata key removed.

    Raises:
        ValueError: if ``documentType`` is not a supported type.
    """
    loader_mapping = {
        "PDF": PyPDFLoader,
        "DOCUMENT": Docx2txtLoader,
        "MARKUP": UnstructuredMarkdownLoader,
        "TXT": TextLoader,
        "PPTX": UnstructuredPowerPointLoader,
        "HTML": UnstructuredHTMLLoader,
    }
    file_bytes = base64.b64decode(req["base64Content"])
    # Loaders need a real file path, so spill the bytes to a temp file.
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(file_bytes)
        document_type = req["documentType"].upper()
        splitting_strategy = req["splittingStrategy"].upper()
        if document_type not in loader_mapping:
            raise ValueError("File type not supported")
        if document_type == splitting_strategy and document_type in ("MARKUP", "HTML"):
            # Markdown/HTML splitters operate on the raw text directly.
            return file_bytes.decode()
        loader = loader_mapping[document_type](path)
        documents = loader.load()
        for doc in documents:
            # Drop the temp-file path; pop with default avoids a KeyError
            # if a loader does not set "source".
            doc.metadata.pop("source", None)
        return documents
    finally:
        # Always clean up the temp file, including on the early return.
        os.remove(path)