in parlai/torchscript/modules.py [0:0]
def findall(cls, text: str) -> List[str]:
"""
Split tokens in a manner that replicates parlai.utils.bpe.Gpt2BpeHelper.
"""
contraction_endings = ["s", "t", "re", "ve", "m", "ll", "d"]
tokens: List[str] = []
idx = 0
num_passes = 0
while idx < len(text):
num_passes += 1
if num_passes > 10000:
return ["*** Infinite loop in ScriptableGpt2BpeHelper.findall()! ***"]
if text[idx] == "'":
# Capture contradiction suffixes
captured_suffix = False
for ending in contraction_endings:
if text[idx + 1 : idx + 1 + len(ending)] == ending:
tokens.append("'" + ending)
idx += 1 + len(ending)
captured_suffix = True
break
if captured_suffix:
continue
if not text[idx].isspace() or (
text[idx] == " " and idx + 1 < len(text) and not text[idx + 1].isspace()
):
# Capture runs of one type of character
if text[idx] == " ":
last_matching_idx = idx + 1
else:
last_matching_idx = idx
if text[last_matching_idx].isalpha():
while (
last_matching_idx + 1 < len(text)
and text[last_matching_idx + 1].isalpha()
):
last_matching_idx += 1
elif text[last_matching_idx].isnumeric():
while (
last_matching_idx + 1 < len(text)
and text[last_matching_idx + 1].isnumeric()
):
last_matching_idx += 1
else:
while (
last_matching_idx + 1 < len(text)
and not text[last_matching_idx + 1].isspace()
and not text[last_matching_idx + 1].isalpha()
and not text[last_matching_idx + 1].isnumeric()
):
last_matching_idx += 1
tokens.append(text[idx : last_matching_idx + 1])
idx = last_matching_idx + 1
continue
if idx + 1 < len(text) and text[idx + 1].isspace():
# Capture runs of space characters up until just before the final one
last_space_idx = idx + 1
while (
last_space_idx + 1 < len(text)
and text[last_space_idx + 1].isspace()
):
last_space_idx += 1
if last_space_idx + 1 == len(text):
# Include the last char, which is a space char
tokens.append(text[idx : last_space_idx + 1])
idx = last_space_idx + 1
else:
tokens.append(text[idx:last_space_idx])
idx = last_space_idx
continue
if True:
# Capture runs of space characters
last_space_idx = idx
while (
last_space_idx + 1 < len(text)
and text[last_space_idx + 1].isspace()
):
last_space_idx += 1
tokens.append(text[idx : last_space_idx + 1])
idx = last_space_idx + 1
return tokens