in augly/text/augmenters/typo.py [0:0]
def substitute(self, data: str) -> str:
    """
    Returns text where random words are typos

    The text is tokenized with `self.tokenizer`, the indices to augment are
    chosen via `get_aug_idxes`, and the output is rebuilt in a single
    left-to-right pass so that every input token is either emitted or consumed
    by a replacement exactly once. For each chosen index:

    1. if `self.model` is set, it is queried with windows of tokens starting
       at that index, from `self.max_typo_length` tokens (clipped to the
       tokens that remain) down to a single token; the first window that
       yields candidates is replaced by one sampled misspelling and all of its
       tokens are consumed
    2. otherwise, if `self.augmenters` is non-empty, one augmenter is sampled
       and applied to the single token
    3. otherwise the token is kept unchanged

    If `self.max_typo_length` is less than 1 no dictionary lookup is made and
    only steps 2 and 3 apply.

    @param data: the text where the word substitutions will occur

    @returns: the detokenized text with the typos applied
    """
    tokens = self.tokenizer(data)
    aug_word_cnt = self._generate_aug_cnt(
        len(tokens), self.aug_min, self.aug_max, self.aug_p
    )
    filtered_word_idxes = self.skip_aug(self.pre_skip_aug(tokens), tokens)
    aug_word_idxes = set(
        get_aug_idxes(self, tokens, filtered_word_idxes, aug_word_cnt, Method.WORD)
    )

    results = []
    num_tokens = len(tokens)
    i = 0
    while i < num_tokens:
        token = tokens[i]
        if i not in aug_word_idxes:
            results.append(token)
            i += 1
            continue

        misspellings = None
        consumed = 1
        if self.model:
            # Longest window first, so a multi-word dictionary entry is not
            # shadowed by an entry for its first word alone.
            longest = min(self.max_typo_length, num_tokens - i)
            for window in range(longest, 0, -1):
                phrase = " ".join(tokens[i : i + window])
                misspellings = self.model.replace(phrase)
                if misspellings:
                    consumed = window
                    break

        if misspellings:
            misspelling = self.sample(misspellings, 1)[0]
            results.append(self.align_capitalization(token, misspelling))
        elif len(self.augmenters) > 0:
            aug = self.sample(self.augmenters, 1)[0]
            new_token = aug.augment(token)
            results.append(self.align_capitalization(token, new_token))
        else:
            # If no misspelling is found in the dict & no other typo types are being
            # used, don't change the token
            results.append(token)

        # Skip every token covered by the replacement so none is emitted twice.
        i += consumed

    return detokenize(results)