in chunking/chunker_factory.py [0:0]
def get_chunker(self, data):
    """
    Get the appropriate chunker based on the document's file extension.

    The filename is obtained from ``data`` with ``get_filename_from_data`` and
    its extension with ``get_file_extension``; the extension then selects the
    chunker class. Extensions that match none of the explicit cases fall back
    to ``LangChainChunker``.

    Args:
        data (dict): The data containing document information.

    Returns:
        BaseChunker: An instance of a chunker class.

    Raises:
        RuntimeError: If the extension is 'docx' or 'pptx' and
            ``self.docint_40_api`` is falsy.
    """
    filename = get_filename_from_data(data)
    logging.info(f"[chunker_factory][{filename}] Creating chunker")
    extension = get_file_extension(filename)

    if extension == 'vtt':
        return TranscriptionChunker(data)
    if extension == 'json':
        return JSONChunker(data)
    if extension in ('xlsx', 'xls'):
        return SpreadsheetChunker(data)

    office_formats = ('docx', 'pptx')
    # Guard: office formats are only accepted when the 4.0 API flag is set.
    # `self.docint_40_api` is read only for these extensions (short-circuit).
    if extension in office_formats and not self.docint_40_api:
        logging.info(f"[chunker_factory][{filename}] Processing 'pptx' and 'docx' files requires Doc Intelligence 4.0.")
        raise RuntimeError("Processing 'pptx' and 'docx' files requires Doc Intelligence 4.0.")

    doc_analysis_formats = ('pdf', 'png', 'jpeg', 'jpg', 'bmp', 'tiff') + office_formats
    if extension in doc_analysis_formats:
        # Same choice for every document-analysis format: multimodal if enabled.
        if self.multimodality:
            return MultimodalChunker(data)
        return DocAnalysisChunker(data)

    # Must be an equality test: `extension in ('nl2sql')` is a substring check
    # against the *string* 'nl2sql' (parentheses alone do not make a tuple), so
    # '', 'sql', 'nl', etc. would wrongly be routed to NL2SQLChunker.
    if extension == 'nl2sql':
        return NL2SQLChunker(data)

    return LangChainChunker(data)