in chunking/chunkers/base_chunker.py [0:0]
def __init__(self, data):
    """
    Initializes the BaseChunker with the provided data dictionary.

    Parameters
    ----------
    data : dict
        A dictionary containing the following keys:

        Required:
            - "documentUrl" (str): The URL of the document.
            - "fileName" (str): The name of the document file.

        Optional:
            - "documentSasToken" (str): The SAS token for accessing the document.
              May be missing, empty or None if not using storage account or
              key-based storage access; all three are treated as an empty string.
            - "documentContent" (str): The raw content of the document.
            - "documentBytes" (bytes): The binary content of the document. A
              warning is logged when it is missing or empty.
            - "documentContentType" (str): The MIME type of the document content.
              Not read by this constructor; it stays available through `data`.

    Attributes
    ----------
    data : dict
        The dictionary passed in, stored as-is.
    url : str
        The document's URL.
    sas_token : str
        The SAS token for accessing the document. Empty if not provided.
    file_url : str
        The full URL constructed by concatenating `url` and `sas_token`.
    filename : str
        The name of the file, taken from `data["fileName"]`.
    filepath : str
        The value returned by `get_filepath_from_data(data)`.
    extension : str
        The value returned by `get_file_extension(filename)`.
    document_content : str
        The raw content of the document, or an empty string if not provided.
    document_bytes : bytes or None
        The binary content of the document if provided; otherwise, `None`.
    token_estimator : GptTokenEstimator
        An instance for estimating token counts.
    aoai_client : AzureOpenAIClient
        An instance of the Azure OpenAI client initialized with the filename.
    embeddings_vector_size : int
        Read from the `AZURE_EMBEDDINGS_VECTOR_SIZE` environment variable;
        defaults to 3072 when the variable is not set.

    Raises
    ------
    KeyError
        If "documentUrl" or "fileName" is missing from `data`.
    ValueError
        If `AZURE_EMBEDDINGS_VECTOR_SIZE` is set to a non-integer value.
    """
    self.data = data
    self.url = data['documentUrl']
    # `or ""` also covers a key that is present but None; a plain
    # .get(key, "") would let None through and file_url would end in "None".
    self.sas_token = data.get('documentSasToken') or ""
    self.file_url = f"{self.url}{self.sas_token}"
    self.filename = data['fileName']
    self.filepath = get_filepath_from_data(data)
    self.extension = get_file_extension(self.filename)
    # Normalize a missing/None/empty content value to an empty string.
    self.document_content = data.get('documentContent') or ""
    self.token_estimator = GptTokenEstimator()
    self.aoai_client = AzureOpenAIClient(document_filename=self.filename)
    # Missing and empty bytes are both normalized to None, with a warning.
    self.document_bytes = data.get('documentBytes') or None
    if self.document_bytes is None:
        logging.warning(f"[base_chunker][{self.filename}] Document bytes not provided.")
    self.embeddings_vector_size = int(os.getenv("AZURE_EMBEDDINGS_VECTOR_SIZE", "3072"))