def __init__()

in chunking/chunkers/base_chunker.py


    def __init__(self, data):
        """
        Initializes the BaseChunker with the provided data dictionary.

        Parameters
        ----------
        data : dict
            A dictionary containing the following keys:

            Required:
                - "documentUrl" (str): The URL of the document.
                - "fileName" (str): The name of the document file.
                - "documentContentType" (str): The MIME type of the document content.

            Optional:
                - "documentBytes" (bytes): The binary content of the document. If omitted,
                  `document_bytes` is set to `None` and a warning is logged.
                - "documentSasToken" (str): The SAS token for accessing the document. Can be an empty string
                  if not using storage account or key-based storage access.
                - "documentContent" (str): The raw content of the document. Defaults to an empty string.
        
        Attributes
        ----------
        data : dict
            The raw input data dictionary, stored as-is.
        url : str
            The document's URL.
        sas_token : str
            The SAS token for accessing the document. May be empty if not required.
        file_url : str
            The full URL constructed by concatenating `url` and `sas_token`.
        filename : str
            The name of the file, taken from the "fileName" key of the data dictionary.
        filepath : str
            The path of the file, derived from the data dictionary.
        extension : str
            The file extension extracted from the filename.
        document_content : str
            The raw content of the document.
        document_bytes : bytes or None
            The binary content of the document if provided; otherwise, `None`.
        token_estimator : GptTokenEstimator
            An instance for estimating token counts.
        aoai_client : AzureOpenAIClient
            An instance of the Azure OpenAI client initialized with the filename.
        embeddings_vector_size : int
            The embedding vector size, read from the AZURE_EMBEDDINGS_VECTOR_SIZE
            environment variable (defaults to 3072).
        """
        self.data = data
        self.url = data['documentUrl']
        self.sas_token = data.get('documentSasToken', "")
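        # The SAS token is appended to the URL verbatim, so it is expected to
        # carry its own leading '?' (or be empty) for file_url to be valid.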
        self.file_url = f"{self.url}{self.sas_token}"
        self.filename = data['fileName']
        self.filepath = get_filepath_from_data(data)
        self.extension = get_file_extension(self.filename)
        self.document_content = data.get('documentContent') or ""
        self.token_estimator = GptTokenEstimator()
        self.aoai_client = AzureOpenAIClient(document_filename=self.filename)
        self.document_bytes = data.get('documentBytes') or None
        if self.document_bytes is None:
            logging.warning(f"[base_chunker][{self.filename}] Document bytes not provided.")
        self.embeddings_vector_size = int(os.getenv("AZURE_EMBEDDINGS_VECTOR_SIZE", "3072"))
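
For reference, a minimal usage sketch. The key names follow the docstring above; the URL, bytes, and content-type values are placeholders, and this assumes BaseChunker can be instantiated directly (in practice a concrete subclass may be used) and that GptTokenEstimator and AzureOpenAIClient are importable so __init__ can run:

    data = {
        "documentUrl": "https://example.blob.core.windows.net/docs/report.pdf",  # placeholder
        "fileName": "report.pdf",
        "documentContentType": "application/pdf",
        "documentBytes": b"%PDF-1.7 ...",  # placeholder bytes
        "documentSasToken": "",            # empty when SAS-based access is not used
    }

    chunker = BaseChunker(data)
    print(chunker.file_url)   # documentUrl with the (empty) SAS token appended
    print(chunker.extension)  # derived from "report.pdf" by get_file_extension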