def findall()

in parlai/torchscript/modules.py [0:0]


    def findall(cls, text: str) -> List[str]:
        """
        Split tokens in a manner that replicates parlai.utils.bpe.Gpt2BpeHelper.

        The text is scanned left to right with explicit index arithmetic (no
        regular expressions) and cut into the following kinds of tokens, tried
        in this order at every position:

        1. An apostrophe followed by one of the contraction endings
           ``s, t, re, ve, m, ll, d`` (e.g. ``"'re"``).
        2. A run of alphabetic characters, a run of numeric characters, or a
           run of characters that are neither whitespace, alphabetic nor
           numeric. The run may be prefixed by a single ``" "``.
        3. A run of whitespace. When the run is followed by a non-whitespace
           character, the last whitespace character is left out of the run so
           that it can start the next token.

        Concatenating the returned tokens reproduces ``text`` exactly.

        :param text: the string to split.
        :return: the list of token strings, in order of appearance. An empty
            string yields an empty list.
        """
        contraction_endings = ["s", "t", "re", "ve", "m", "ll", "d"]

        tokens: List[str] = []
        text_len = len(text)
        idx = 0
        num_passes = 0
        while idx < text_len:
            num_passes += 1
            # Safety net against a non-advancing loop. Every branch below
            # consumes at least one character, so a correct run never needs
            # more passes than there are characters. Bounding by the text
            # length (rather than a fixed constant) keeps long inputs from
            # being misreported as an infinite loop.
            if num_passes > text_len:
                return ["*** Infinite loop in ScriptableGpt2BpeHelper.findall()! ***"]
            if text[idx] == "'":
                # Capture contraction suffixes
                captured_suffix = False
                for ending in contraction_endings:
                    if text[idx + 1 : idx + 1 + len(ending)] == ending:
                        tokens.append("'" + ending)
                        idx += 1 + len(ending)
                        captured_suffix = True
                        break
                if captured_suffix:
                    continue
            if not text[idx].isspace() or (
                text[idx] == " " and idx + 1 < text_len and not text[idx + 1].isspace()
            ):
                # Capture runs of one type of character, optionally prefixed by
                # a single space that is kept as part of the token
                if text[idx] == " ":
                    last_matching_idx = idx + 1
                else:
                    last_matching_idx = idx
                if text[last_matching_idx].isalpha():
                    while (
                        last_matching_idx + 1 < text_len
                        and text[last_matching_idx + 1].isalpha()
                    ):
                        last_matching_idx += 1
                elif text[last_matching_idx].isnumeric():
                    while (
                        last_matching_idx + 1 < text_len
                        and text[last_matching_idx + 1].isnumeric()
                    ):
                        last_matching_idx += 1
                else:
                    while (
                        last_matching_idx + 1 < text_len
                        and not text[last_matching_idx + 1].isspace()
                        and not text[last_matching_idx + 1].isalpha()
                        and not text[last_matching_idx + 1].isnumeric()
                    ):
                        last_matching_idx += 1
                tokens.append(text[idx : last_matching_idx + 1])
                idx = last_matching_idx + 1
                continue
            if idx + 1 < text_len and text[idx + 1].isspace():
                # Capture runs of space characters up until just before the final one
                last_space_idx = idx + 1
                while (
                    last_space_idx + 1 < text_len
                    and text[last_space_idx + 1].isspace()
                ):
                    last_space_idx += 1
                if last_space_idx + 1 == text_len:
                    # Include the last char, which is a space char
                    tokens.append(text[idx : last_space_idx + 1])
                    idx = last_space_idx + 1
                else:
                    # Leave the final space char for the next pass
                    tokens.append(text[idx:last_space_idx])
                    idx = last_space_idx
                continue
            # Fallback: capture runs of space characters. Reached for a
            # whitespace char that is last in the text, or a non-" "
            # whitespace char directly followed by a non-whitespace char.
            last_space_idx = idx
            while (
                last_space_idx + 1 < text_len
                and text[last_space_idx + 1].isspace()
            ):
                last_space_idx += 1
            tokens.append(text[idx : last_space_idx + 1])
            idx = last_space_idx + 1
        return tokens