def get_chunker()

in chunking/chunker_factory.py [0:0]


    def get_chunker(self, data):
        """
        Select and instantiate the chunker that matches the document's file extension.

        The filename is obtained from ``data`` via ``get_filename_from_data`` and its
        extension via ``get_file_extension``. The extension is compared exactly
        against the literals below, so it is expected without a leading dot
        (e.g. ``'pdf'``).

        Dispatch rules:
            * ``vtt``                                  -> ``TranscriptionChunker``
            * ``json``                                 -> ``JSONChunker``
            * ``xlsx``, ``xls``                        -> ``SpreadsheetChunker``
            * ``pdf``, ``png``, ``jpeg``, ``jpg``,
              ``bmp``, ``tiff``                        -> ``MultimodalChunker`` when
              ``self.multimodality`` is truthy, otherwise ``DocAnalysisChunker``
            * ``docx``, ``pptx``                       -> same choice as above, but only
              when ``self.docint_40_api`` is truthy
            * ``nl2sql``                               -> ``NL2SQLChunker``
            * anything else                            -> ``LangChainChunker``

        Args:
            data (dict): The data containing document information; passed unchanged
                to the selected chunker's constructor.

        Returns:
            An instance of the selected chunker class.

        Raises:
            RuntimeError: If the extension is ``docx`` or ``pptx`` and
                ``self.docint_40_api`` is falsy.
        """
        filename = get_filename_from_data(data)
        logging.info(f"[chunker_factory][{filename}] Creating chunker")

        extension = get_file_extension(filename)

        if extension == 'vtt':
            return TranscriptionChunker(data)
        if extension == 'json':
            return JSONChunker(data)
        if extension in ('xlsx', 'xls'):
            return SpreadsheetChunker(data)

        # Office formats are only handled when Doc Intelligence 4.0 is enabled;
        # reject them up front so the shared branch below can cover both groups.
        if extension in ('docx', 'pptx') and not self.docint_40_api:
            logging.info(f"[chunker_factory][{filename}] Processing 'pptx' and 'docx' files requires Doc Intelligence 4.0.")
            raise RuntimeError("Processing 'pptx' and 'docx' files requires Doc Intelligence 4.0.")

        if extension in ('pdf', 'png', 'jpeg', 'jpg', 'bmp', 'tiff', 'docx', 'pptx'):
            if self.multimodality:
                return MultimodalChunker(data)
            return DocAnalysisChunker(data)

        # Equality, not `in ('nl2sql')`: the parenthesised form is a plain string,
        # so `in` would be a substring test and match '', 'sql', 'nl', etc.
        if extension == 'nl2sql':
            return NL2SQLChunker(data)

        return LangChainChunker(data)