in seed/util/preprocess.py [0:0]
def split_text_using_tiktoken(texts, chunk_size: int, chunk_overlap: int, encoding_name: str = "o200k_base"):
    """Join the given texts and re-split them into chunks with a tiktoken-based splitter.

    All inputs are concatenated with a blank line (``"\\n\\n"``) between them,
    and the combined text is then split again on that same separator by a
    ``CharacterTextSplitter`` built via ``from_tiktoken_encoder``.

    Args:
        texts: Sequence of plain strings and/or ``langchain.schema.Document``
            objects. For documents, only ``page_content`` is used; any other
            attributes are dropped.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        encoding_name: Name of the tiktoken encoding passed to the splitter.

    Returns:
        The list of chunks produced by ``text_splitter.split_text``; an empty
        list when ``texts`` is empty.
    """
    # Explicit length check (rather than truthiness) so array-like inputs
    # whose truth value is ambiguous keep working; avoids indexing texts[0]
    # on an empty input.
    if len(texts) == 0:
        return []

    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        encoding_name=encoding_name,
    )

    # Decide per element instead of trusting the first one, so a mixed list of
    # strings and Documents is handled instead of failing inside join().
    contents = [
        text.page_content if isinstance(text, langchain.schema.Document) else text
        for text in texts
    ]

    joined_texts = "\n\n".join(contents)
    return text_splitter.split_text(joined_texts)