in code/embedding-function/utilities/document_loading/web.py [0:0]
def load(self, document_url: str) -> List[SourceDocument]:
documents = WebBaseLoader(document_url).load()
for document in documents:
document.page_content = re.sub("\n{3,}", "\n\n", document.page_content)
# Remove half non-ascii character from start/end of doc content
pattern = re.compile(
r"[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]"
)
document.page_content = re.sub(pattern, "", document.page_content)
if document.page_content == "":
documents.remove(document)
source_documents: List[SourceDocument] = [
SourceDocument(
content=document.page_content,
source=document.metadata["source"],
)
for document in documents
]
return source_documents