def _chunk_file()

in src/databao_context_engine/plugins/unstructured_files_plugin.py [0:0]


    def _chunk_file(self, file_content: str) -> list[FileChunk]:
        """Split ``file_content`` into overlapping chunks of whitespace-separated words.

        Each chunk holds at most ``self.max_tokens`` words, re-joined with single
        spaces. Consecutive chunks share ``self.tokens_overlap`` words. The
        ``chunk_index`` of a chunk is the offset of its first word in the word
        list, not a sequential chunk counter.

        Args:
            file_content: Raw text of the file.

        Returns:
            The chunks in document order. Empty when ``file_content`` contains
            no non-whitespace characters.

        Raises:
            ValueError: If ``self.max_tokens`` is smaller than 1.
        """
        max_tokens = self.max_tokens
        if max_tokens < 1:
            # A non-positive window can never consume a word, so the loop below
            # could not terminate.
            raise ValueError(f"max_tokens must be at least 1, got {max_tokens}")

        # str.split() with no separator drops leading/trailing whitespace and
        # never yields empty strings, so blank "words" do not eat into the token
        # budget or produce an empty chunk for a blank file.
        words = file_content.split()
        total_words = len(words)

        # Clamp the overlap to [0, max_tokens - 1]: a negative overlap would skip
        # words between chunks, and an overlap >= max_tokens would stop the window
        # from advancing (infinite loop).
        overlap = min(max(self.tokens_overlap, 0), max_tokens - 1)
        stride = max_tokens - overlap

        chunks: list[FileChunk] = []
        for start in range(0, total_words, stride):
            end = min(total_words, start + max_tokens)
            chunks.append(
                FileChunk(
                    chunk_index=start,
                    chunk_content=" ".join(words[start:end]),
                )
            )
            if end == total_words:
                # The last word is covered; a further window would only repeat
                # the tail of this chunk.
                break

        return chunks