def load_file()

in LogicApps-AI-RAG-Demo/TokenizeDocFunction/function_app.py [0:0]


def load_file(req):
    """Decode a base64-encoded document from ``req`` and return its contents.

    Args:
        req: Mapping with the keys ``"base64Content"`` (base64-encoded file
            bytes), ``"documentType"`` and ``"splittingStrategy"``. The two
            string values are compared case-insensitively.

    Returns:
        The decoded text (``str``) when the document type is ``MARKUP`` or
        ``HTML`` and the splitting strategy has the same value; otherwise
        whatever the selected loader's ``load()`` returns, with the
        ``"source"`` metadata entry removed from every document.

    Raises:
        ValueError: If ``documentType`` has no registered loader.
        KeyError: If a required key is missing from ``req``.
    """
    file_bytes = base64.b64decode(req["base64Content"])
    document_type = req["documentType"].upper()
    splitting_strategy = req["splittingStrategy"].upper()

    # The markdown and HTML splitters consume the raw text themselves, so
    # return it directly -- no loader and no temporary file are needed.
    if document_type in ("MARKUP", "HTML") and splitting_strategy == document_type:
        return file_bytes.decode()

    loader_mapping = {
        "PDF": PyPDFLoader,
        "DOCUMENT": Docx2txtLoader,
        "MARKUP": UnstructuredMarkdownLoader,
        "TXT": TextLoader,
        "PPTX": UnstructuredPowerPointLoader,
        "HTML": UnstructuredHTMLLoader,
    }

    # Reject unsupported types before touching the filesystem.
    loader_class = loader_mapping.get(document_type)
    if loader_class is None:
        raise ValueError("File type not supported")

    # The loaders are constructed from a filesystem path, so the decoded
    # bytes are spilled to a temporary file that is always cleaned up.
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(file_bytes)

        documents = loader_class(path).load()
    finally:
        os.remove(path)

    # Remove the source entry (presumably the temporary path, which is
    # meaningless to callers). Not every loader is guaranteed to set it,
    # so tolerate its absence instead of raising KeyError.
    for doc in documents:
        doc.metadata.pop("source", None)

    return documents