code/embedding-function/utilities/common/source

from typing import Optional, Type
import hashlib
import json
from urllib.parse import urlparse, quote
from ..helpers.azure_blob_storage_client import AzureBlobStorageClient

# Marker embedded in ``source`` URLs; swapped for the value returned by
# ``AzureBlobStorageClient.get_container_sas()`` when a link is rendered.
_SAS_TOKEN_PLACEHOLDER = "_SAS_TOKEN_PLACEHOLDER_"

# Host suffix that makes ``from_metadata`` append the SAS placeholder.
_BLOB_HOST_SUFFIX = ".blob.core.windows.net"


class SourceDocument:
    """A piece of content together with its source and optional metadata.

    Instances compare equal when all of their fields are equal. Because
    ``__eq__`` is defined without ``__hash__``, instances are unhashable.
    """

    def __init__(
        self,
        content: str,
        source: str,
        id: Optional[str] = None,
        title: Optional[str] = None,
        chunk: Optional[int] = None,
        offset: Optional[int] = None,
        page_number: Optional[int] = None,
        chunk_id: Optional[str] = None,
        sharepoint_file_id: Optional[str] = None,
    ):
        """Store the given values on the instance unchanged.

        Note the positional order: ``content`` and ``source`` come first,
        ``id`` is third. Prefer keyword arguments for everything else.
        """
        self.id = id
        self.content = content
        self.source = source
        self.title = title
        self.chunk = chunk
        self.offset = offset
        self.page_number = page_number
        self.chunk_id = chunk_id
        self.sharepoint_file_id = sharepoint_file_id

    def __str__(self):
        """Return a one-line summary of every field except ``content``."""
        return (
            f"SourceDocument(id={self.id}, title={self.title}, "
            f"source={self.source}, chunk={self.chunk}, "
            f"offset={self.offset}, page_number={self.page_number}, "
            f"chunk_id={self.chunk_id}, "
            f"sharepoint_file_id={self.sharepoint_file_id})"
        )

    def __eq__(self, other):
        """Compare field by field against another ``SourceDocument``.

        Returns ``NotImplemented`` for any other operand so that ``==``
        evaluates to ``False`` instead of raising ``AttributeError`` when
        the operand lacks the document attributes (e.g. ``object()``).
        """
        if not isinstance(other, SourceDocument):
            return NotImplemented
        return (
            self.id == other.id
            and self.content == other.content
            and self.source == other.source
            and self.title == other.title
            and self.chunk == other.chunk
            and self.offset == other.offset
            and self.page_number == other.page_number
            and self.chunk_id == other.chunk_id
            and self.sharepoint_file_id == other.sharepoint_file_id
        )

    def to_json(self):
        """Serialize this document to a JSON string."""
        return json.dumps(self, cls=SourceDocumentEncoder)

    @classmethod
    def from_json(cls, json_string):
        """Build a ``SourceDocument`` from a string made by ``to_json``."""
        return json.loads(json_string, cls=SourceDocumentDecoder)

    @classmethod
    def from_dict(cls, dict_obj):
        """Build an instance from a mapping of field names to values.

        Every key except ``sharepoint_file_id`` is required.

        Raises:
            KeyError: if a required key is missing from ``dict_obj``.
        """
        # Keyword arguments are essential here: the constructor's
        # positional order is (content, source, id, ...), so passing
        # (id, content, source, ...) positionally scrambles the fields.
        return cls(
            id=dict_obj["id"],
            content=dict_obj["content"],
            source=dict_obj["source"],
            title=dict_obj["title"],
            chunk=dict_obj["chunk"],
            offset=dict_obj["offset"],
            page_number=dict_obj["page_number"],
            chunk_id=dict_obj["chunk_id"],
            sharepoint_file_id=dict_obj.get("sharepoint_file_id"),
        )

    @classmethod
    def from_metadata(
        cls: Type["SourceDocument"],
        content: str,
        metadata: dict,
        document_url: Optional[str],
        idx: Optional[int],
    ) -> "SourceDocument":
        """Build an instance from content, a metadata dict and its URL.

        Values present in ``metadata`` win; otherwise defaults are derived
        from ``document_url`` and ``idx``:

        * ``id``: ``"doc_"`` + SHA-1 hex digest of ``"{file_url}_{idx}"``
        * ``source``: ``file_url``, plus the SAS placeholder when the host
          ends with ``.blob.core.windows.net``
        * ``title``: the path component of the URL
        * ``chunk``: ``idx``

        ``file_url`` is ``scheme://netloc/path``; any query string or
        fragment in ``document_url`` is dropped. A ``document_url`` of
        ``None`` is treated as an empty URL.
        """
        # urlparse(None) yields bytes fields, which would make the string
        # concatenation below raise TypeError; normalise to "" first.
        parsed_url = urlparse("" if document_url is None else document_url)
        file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
        filename = parsed_url.path

        # SHA-1 is used only to derive a stable identifier. Changing the
        # algorithm or the hashed text would change every generated id.
        digest = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
        hash_key = f"doc_{digest}"

        if parsed_url.netloc.endswith(_BLOB_HOST_SUFFIX):
            sas_placeholder = _SAS_TOKEN_PLACEHOLDER
        else:
            sas_placeholder = ""

        return cls(
            id=metadata.get("id", hash_key),
            content=content,
            source=metadata.get("source", f"{file_url}{sas_placeholder}"),
            title=metadata.get("title", filename),
            chunk=metadata.get("chunk", idx),
            offset=metadata.get("offset"),
            page_number=metadata.get("page_number"),
            chunk_id=metadata.get("chunk_id"),
            sharepoint_file_id=metadata.get("sharepoint_file_id"),
        )

    def get_filename(self, include_path=False):
        """Return the last ``/``-separated segment of ``source``.

        The SAS placeholder and any ``http://`` text are removed first.
        Despite its name, ``include_path`` only controls the extension:
        when true the whole last segment is returned (``report.v2.pdf``);
        when false only the text before its first ``.`` (``report``).
        """
        filename = self.source.replace(_SAS_TOKEN_PLACEHOLDER, "").replace(
            "http://", ""
        )
        last_segment = filename.split("/")[-1]
        if include_path:
            return last_segment
        return last_segment.split(".")[0]

    def get_markdown_url(self):
        """Return a Markdown link ``[title](url)`` for this document.

        ``source`` is percent-encoded with ``:`` and ``/`` left intact.
        If it contains the SAS placeholder, the placeholder is replaced
        with the value from ``AzureBlobStorageClient.get_container_sas()``.
        """
        url = quote(self.source, safe=":/")
        if _SAS_TOKEN_PLACEHOLDER in url:
            # The client is only created when a substitution is needed.
            blob_client = AzureBlobStorageClient()
            container_sas = blob_client.get_container_sas()
            url = url.replace(_SAS_TOKEN_PLACEHOLDER, container_sas)
        return f"[{self.title}]({url})"


class SourceDocumentEncoder(json.JSONEncoder):
    """JSON encoder that writes a ``SourceDocument`` as a flat object."""

    def default(self, obj):
        """Return a dict of fields for ``SourceDocument`` instances.

        Any other type is passed to the base class, which raises
        ``TypeError`` for objects it cannot serialize.
        """
        if isinstance(obj, SourceDocument):
            return {
                "id": obj.id,
                "content": obj.content,
                "source": obj.source,
                "title": obj.title,
                "chunk": obj.chunk,
                "offset": obj.offset,
                "page_number": obj.page_number,
                "chunk_id": obj.chunk_id,
                "sharepoint_file_id": obj.sharepoint_file_id,
            }
        return super().default(obj)


class SourceDocumentDecoder(json.JSONDecoder):
    """JSON decoder that turns a top-level object into a ``SourceDocument``."""

    def decode(self, s, **kwargs):
        """Parse ``s`` and build a ``SourceDocument`` from the result.

        Raises:
            KeyError: if a required field is missing from the parsed object.
        """
        obj = super().decode(s, **kwargs)
        # Same mapping as from_dict: all keys required except
        # "sharepoint_file_id".
        return SourceDocument.from_dict(obj)

code/embedding-function/utilities/common/source_document.py (136 lines of code) (raw):