in src/databao_context_engine/plugins/unstructured_files_plugin.py [0:0]
def _chunk_file(self, file_content: str) -> list[FileChunk]:
    """Split file content into overlapping chunks of whitespace-delimited words.

    Each chunk holds at most ``self.max_tokens`` words, re-joined with single
    spaces. Every chunk after the first starts ``self.tokens_overlap`` words
    before the end of the previous one, so neighbouring chunks share that
    many words.

    Args:
        file_content: Raw text to split.

    Returns:
        The chunks in document order. ``chunk_index`` is the offset, in words,
        of the chunk's first word within the file (not a sequential counter).
        Empty or whitespace-only content yields an empty list.

    Raises:
        ValueError: If the file needs more than one chunk and
            ``self.max_tokens`` / ``self.tokens_overlap`` would prevent the
            window from moving forward (e.g. overlap >= max_tokens).
    """
    # str.split() with no separator drops the empty strings that a regex split
    # on whitespace produces for leading/trailing whitespace and for empty
    # input, so no blank tokens end up counted or joined into a chunk.
    words = file_content.split()
    total_words = len(words)

    chunks = []
    start = 0
    while start < total_words:
        end = min(total_words, start + self.max_tokens)
        chunks.append(
            FileChunk(
                chunk_index=start,
                chunk_content=" ".join(words[start:end]),
            )
        )
        if end >= total_words:
            # The last word has been emitted; nothing left to chunk.
            break

        next_start = end - self.tokens_overlap
        if next_start <= start:
            # Without this guard the loop would never terminate and would
            # keep appending chunks until memory is exhausted.
            raise ValueError(
                "Cannot chunk file: tokens_overlap "
                f"({self.tokens_overlap}) must be smaller than max_tokens "
                f"({self.max_tokens})"
            )
        start = next_start

    return chunks