code/embedding-function/utilities/common/source_document.py (136 lines of code) (raw):
from typing import Optional, Type
import hashlib
import json
from urllib.parse import urlparse, quote
from ..helpers.azure_blob_storage_client import AzureBlobStorageClient
class SourceDocument:
    """Document content together with the metadata describing its origin.

    Instances carry the text (``content``), where it came from (``source``),
    and optional bookkeeping fields (``id``, ``title``, ``chunk``, ``offset``,
    ``page_number``, ``chunk_id``, ``sharepoint_file_id``).
    """

    def __init__(
        self,
        content: str,
        source: str,
        id: Optional[str] = None,
        title: Optional[str] = None,
        chunk: Optional[int] = None,
        offset: Optional[int] = None,
        page_number: Optional[int] = None,
        chunk_id: Optional[str] = None,
        sharepoint_file_id: Optional[str] = None,
    ):
        """Store every argument on the instance under the same name."""
        self.id = id
        self.content = content
        self.source = source
        self.title = title
        self.chunk = chunk
        self.offset = offset
        self.page_number = page_number
        self.chunk_id = chunk_id
        self.sharepoint_file_id = sharepoint_file_id

    def __str__(self) -> str:
        """Return a one-line summary of all fields except ``content``."""
        return (
            f"SourceDocument(id={self.id}, title={self.title}, "
            f"source={self.source}, chunk={self.chunk}, "
            f"offset={self.offset}, page_number={self.page_number}, "
            f"chunk_id={self.chunk_id}, "
            f"sharepoint_file_id={self.sharepoint_file_id})"
        )

    def __eq__(self, other):
        """Compare two documents field by field.

        Returns ``NotImplemented`` for objects that are not
        ``SourceDocument`` instances so Python falls back to its default
        comparison (which yields ``False`` for ``==``) instead of raising
        ``AttributeError`` on a missing attribute.
        """
        if not isinstance(other, SourceDocument):
            return NotImplemented
        return (
            self.id == other.id
            and self.content == other.content
            and self.source == other.source
            and self.title == other.title
            and self.chunk == other.chunk
            and self.offset == other.offset
            and self.page_number == other.page_number
            and self.chunk_id == other.chunk_id
            and self.sharepoint_file_id == other.sharepoint_file_id
        )

    def to_json(self) -> str:
        """Serialize this document to a JSON string via ``SourceDocumentEncoder``."""
        return json.dumps(self, cls=SourceDocumentEncoder)

    @classmethod
    def from_json(cls, json_string):
        """Parse ``json_string`` with ``SourceDocumentDecoder`` and return the result."""
        return json.loads(json_string, cls=SourceDocumentDecoder)

    @classmethod
    def from_dict(cls, dict_obj: dict) -> "SourceDocument":
        """Build a document from a mapping of field names to values.

        All keys except ``sharepoint_file_id`` are required; a missing one
        raises ``KeyError``.
        """
        # Keyword arguments are essential here: __init__ takes
        # (content, source, id, ...), so passing the values positionally in
        # dict order would store the id as content and the source as id.
        return cls(
            id=dict_obj["id"],
            content=dict_obj["content"],
            source=dict_obj["source"],
            title=dict_obj["title"],
            chunk=dict_obj["chunk"],
            offset=dict_obj["offset"],
            page_number=dict_obj["page_number"],
            chunk_id=dict_obj["chunk_id"],
            sharepoint_file_id=dict_obj.get("sharepoint_file_id"),
        )

    @classmethod
    def from_metadata(
        cls: Type["SourceDocument"],
        content: str,
        metadata: dict,
        document_url: Optional[str],
        idx: Optional[int],
    ) -> "SourceDocument":
        """Build a document, filling gaps in ``metadata`` from ``document_url``.

        Values present in ``metadata`` win. Otherwise:

        * ``id`` defaults to ``"doc_"`` + SHA-1 hex digest of
          ``"<url without query/fragment>_<idx>"``;
        * ``source`` defaults to the URL without query/fragment, with
          ``_SAS_TOKEN_PLACEHOLDER_`` appended when the host ends in
          ``.blob.core.windows.net``;
        * ``title`` defaults to the URL path;
        * ``chunk`` defaults to ``idx``.
        """
        # urlparse(None) yields bytes components, which would make the string
        # concatenation below raise TypeError; treat None as an empty URL.
        parsed_url = urlparse(document_url or "")
        file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
        filename = parsed_url.path

        digest = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
        hash_key = f"doc_{digest}"

        is_blob_host = parsed_url.netloc.endswith(".blob.core.windows.net")
        sas_placeholder = "_SAS_TOKEN_PLACEHOLDER_" if is_blob_host else ""

        return cls(
            id=metadata.get("id", hash_key),
            content=content,
            source=metadata.get("source", f"{file_url}{sas_placeholder}"),
            title=metadata.get("title", filename),
            chunk=metadata.get("chunk", idx),
            offset=metadata.get("offset"),
            page_number=metadata.get("page_number"),
            chunk_id=metadata.get("chunk_id"),
            sharepoint_file_id=metadata.get("sharepoint_file_id"),
        )

    def get_filename(self, include_path=False):
        """Return the last ``/``-separated segment of ``source``.

        The SAS placeholder and any ``http://`` substring are removed first.
        With ``include_path=True`` the segment is returned as-is (extension
        included); otherwise only the text before the first ``.`` is returned.
        """
        # NOTE(review): despite its name, include_path never returns leading
        # path components; it only controls whether the extension is kept.
        # Behaviour is preserved as callers may rely on it.
        cleaned = self.source.replace("_SAS_TOKEN_PLACEHOLDER_", "").replace(
            "http://", ""
        )
        last_segment = cleaned.split("/")[-1]
        if include_path:
            return last_segment
        return last_segment.split(".")[0]

    def get_markdown_url(self):
        """Return a Markdown link ``[title](url)`` for this document.

        ``source`` is percent-encoded (keeping ``:`` and ``/``). If the result
        contains ``_SAS_TOKEN_PLACEHOLDER_``, the placeholder is replaced with
        the value returned by ``AzureBlobStorageClient().get_container_sas()``.
        """
        url = quote(self.source, safe=":/")
        if "_SAS_TOKEN_PLACEHOLDER_" in url:
            # Only instantiate the storage client when a SAS is actually needed.
            blob_client = AzureBlobStorageClient()
            container_sas = blob_client.get_container_sas()
            url = url.replace("_SAS_TOKEN_PLACEHOLDER_", container_sas)
        return f"[{self.title}]({url})"
class SourceDocumentEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialize ``SourceDocument`` objects."""

    def default(self, obj):
        """Return a plain dict for a ``SourceDocument``; defer otherwise.

        Any other type is passed to ``json.JSONEncoder.default``.
        """
        if not isinstance(obj, SourceDocument):
            return super().default(obj)

        # Order matters only for the layout of the emitted JSON text.
        exported_fields = (
            "id",
            "content",
            "source",
            "title",
            "chunk",
            "offset",
            "page_number",
            "chunk_id",
            "sharepoint_file_id",
        )
        return {field: getattr(obj, field) for field in exported_fields}
class SourceDocumentDecoder(json.JSONDecoder):
    """JSON decoder that turns a serialized object into a ``SourceDocument``."""

    def decode(self, s, **kwargs):
        """Decode ``s`` with the base decoder and wrap it in a ``SourceDocument``.

        Every field except ``sharepoint_file_id`` must be present in the
        decoded object; a missing one raises ``KeyError``.
        """
        payload = super().decode(s, **kwargs)

        required_keys = (
            "id",
            "content",
            "source",
            "title",
            "chunk",
            "offset",
            "page_number",
            "chunk_id",
        )
        fields = {key: payload[key] for key in required_keys}
        # sharepoint_file_id is optional in the serialized form.
        fields["sharepoint_file_id"] = payload.get("sharepoint_file_id")
        return SourceDocument(**fields)