def split_text_using_tiktoken()

in seed/util/preprocess.py [0:0]


def split_text_using_tiktoken(texts, chunk_size, chunk_overlap, encoding_name="o200k_base"):
    """Join *texts* into one string and re-split it with a tiktoken-based splitter.

    Args:
        texts: Sized iterable of plain strings and/or
            ``langchain.schema.Document`` objects. For documents, only
            ``page_content`` is used.
        chunk_size: Forwarded as ``chunk_size`` to
            ``CharacterTextSplitter.from_tiktoken_encoder``.
        chunk_overlap: Forwarded as ``chunk_overlap`` to the same factory.
        encoding_name: Name of the tiktoken encoding handed to the splitter.

    Returns:
        The result of ``split_text`` on the joined text, or an empty list
        when *texts* is empty.
    """
    # Nothing to split: return early instead of failing on the first-element
    # lookup. ``len`` is used rather than truthiness so that array-like
    # containers with ambiguous truth values are still accepted.
    if len(texts) == 0:
        return []

    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        encoding_name=encoding_name,
    )

    # Decide per item (not from the first element only) so that a mix of
    # Documents and plain strings is handled.
    contents = [
        text.page_content if isinstance(text, langchain.schema.Document) else text
        for text in texts
    ]

    # The join delimiter matches the splitter's separator, so the original
    # item boundaries are candidate split points.
    joined_texts = "\n\n".join(contents)
    return text_splitter.split_text(joined_texts)