code/embedding-function/utilities/document_loading/word_document.py (39 lines of code) (raw):
from typing import List
from io import BytesIO
from docx import Document
import requests
from .document_loading_base import DocumentLoadingBase
from ..common.source_document import SourceDocument
class WordDocumentLoading(DocumentLoadingBase):
    """Load a Word (.docx) document from a URL and render it as tagged text.

    Every paragraph of the document is wrapped in an HTML-style tag chosen
    from the paragraph's Word style name: "Heading 1" .. "Heading 6" become
    ``<h1>`` .. ``<h6>``, and any other style becomes ``<p>``.
    """

    # Seconds passed to ``requests.get`` as its timeout. Without a timeout a
    # stalled server would block the caller indefinitely.
    DOWNLOAD_TIMEOUT_SECONDS = 60

    def __init__(self) -> None:
        super().__init__()
        # Word paragraph style name -> tag name emitted around the paragraph.
        self.doc_headings_to_markdown_tags = {
            "Heading 1": "h1",
            "Heading 2": "h2",
            "Heading 3": "h3",
            "Heading 4": "h4",
            "Heading 5": "h5",
            "Heading 6": "h6",
        }

    def _download_document(self, document_url: str) -> BytesIO:
        """Fetch ``document_url`` and return its body as an in-memory stream.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            requests.Timeout: if the server does not respond in time.
        """
        response = requests.get(
            document_url, timeout=self.DOWNLOAD_TIMEOUT_SECONDS
        )
        # Fail here with a clear HTTP error rather than handing an error page
        # to the .docx parser, which would report it as a corrupt file.
        response.raise_for_status()
        return BytesIO(response.content)

    def _get_tag_name(self, heading_level: str) -> str:
        """Return the tag name for a style name, defaulting to ``"p"``."""
        # str() keeps the lookup tolerant of non-string values (e.g. None),
        # which simply fall through to the "p" default.
        return self.doc_headings_to_markdown_tags.get(str(heading_level), "p")

    def _get_opening_tag(self, heading_level: str) -> str:
        """Return the opening tag (e.g. ``<h1>``) for a paragraph style name.

        Args:
            heading_level: The paragraph's style name, such as "Heading 1".
        """
        return f"<{self._get_tag_name(heading_level)}>"

    def _get_closing_tag(self, heading_level: str) -> str:
        """Return the closing tag (e.g. ``</h1>``) for a paragraph style name.

        Args:
            heading_level: The paragraph's style name, such as "Heading 1".
        """
        return f"</{self._get_tag_name(heading_level)}>"

    def _render_paragraph(self, paragraph) -> str:
        """Wrap one paragraph's text in its tags, terminated by a newline."""
        style_name = paragraph.style.name
        # NOTE(review): paragraph.text is inserted verbatim (not HTML-escaped);
        # confirm downstream consumers expect raw text before changing this.
        return (
            f"{self._get_opening_tag(style_name)}"
            f"{paragraph.text}"
            f"{self._get_closing_tag(style_name)}\n"
        )

    def load(self, document_url: str) -> List[SourceDocument]:
        """Download a Word document and convert it to one ``SourceDocument``.

        Only the entries of ``document.paragraphs`` are rendered, one tagged
        line per paragraph.

        Args:
            document_url: URL the .docx file is downloaded from; also stored
                as the ``source`` of the returned document.

        Returns:
            A single-element list holding the whole rendered document, with
            ``offset`` and ``page_number`` both set to 0.
        """
        document = Document(self._download_document(document_url))
        # Join once instead of growing a string with += in a loop.
        output = "".join(
            self._render_paragraph(paragraph)
            for paragraph in document.paragraphs
        )
        return [
            SourceDocument(
                content=output,
                source=document_url,
                offset=0,
                page_number=0,
            )
        ]