in tiktoken_ext/openai_public.py
from tiktoken.load import load_tiktoken_bpe

ENDOFTEXT = "<|endoftext|>"
ENDOFPROMPT = "<|endofprompt|>"


def o200k_base():
    # Fetch the BPE merge table, verifying the download against a pinned SHA-256 hash.
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )
    special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}
    # This regex could be made more efficient. If I were the one working on this
    # encoding, I would have done a few other things differently too, e.g. I think
    # you can allocate tokens more efficiently across languages.
    pat_str = "|".join(
        [
            # Word ending in lowercase letters, with an optional uppercase/titlecase
            # prefix, an optional single leading non-letter/non-digit character
            # (typically a space), and an optional English contraction suffix.
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            # Word beginning with uppercase/titlecase (or caseless) letters, e.g.
            # acronyms, optionally followed by lowercase letters and a contraction.
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            # Runs of digits, at most three at a time.
            r"""\p{N}{1,3}""",
            # Punctuation/symbol runs, with an optional leading space and trailing
            # newlines or slashes.
            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
            # Newlines, together with any whitespace preceding them.
            r"""\s*[\r\n]+""",
            # Whitespace not immediately followed by a non-space character, so the
            # last space before a word stays attached to that word.
            r"""\s+(?!\S)""",
            # Any remaining whitespace.
            r"""\s+""",
        ]
    )
    return {
        "name": "o200k_base",
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
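
A small usage sketch, assuming the third-party "regex" package and tiktoken itself are installed (the sketch is an illustration, not part of the upstream file): it exercises the pre-tokenizer split and builds a full Encoding from the returned dict.

import regex  # third-party "regex" package; the stdlib "re" lacks \p{...} classes
import tiktoken

params = o200k_base()  # downloads and hash-checks the .tiktoken file on first call

# Pre-tokenization: the pattern carves text into word, number, and punctuation
# chunks; BPE merges then happen only within each chunk.
pat = regex.compile(params["pat_str"])
print(pat.findall("Hello world, it's 12345!"))
# -> ['Hello', ' world', ',', " it's", ' ', '123', '45', '!']

# The dict's keys line up with tiktoken.Encoding's constructor parameters.
enc = tiktoken.Encoding(**params)
tokens = enc.encode("hello world")
assert enc.decode(tokens) == "hello world"

Note how the digit rule shows up in the split above: "12345" becomes '123' and '45', since \p{N}{1,3} never lets a number chunk exceed three digits.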