in augly/text/augmenters/words_augmenter.py [0:0]
def split(self, data: str) -> str:
    """Split randomly selected words of ``data`` in two.

    The text is tokenized with ``tokenize``; the number of words to
    augment comes from ``self._generate_aug_cnt`` (driven by
    ``self.aug_min``, ``self.aug_max`` and ``self.aug_p``), and the
    indices to augment are picked by ``get_aug_idxes`` from the candidates
    returned by ``self.pre_skip_aug``. Each selected token is cut at a
    random position so that both halves are non-empty, and the halves are
    emitted as two separate tokens.

    Args:
        data: the text to augment.

    Returns:
        The detokenized text with the selected words split in two, or
        ``data`` itself, untouched, when no word was selected.
    """
    tokens = tokenize(data)
    aug_word_cnt = self._generate_aug_cnt(
        len(tokens), self.aug_min, self.aug_max, self.aug_p
    )
    filtered_word_idxes = self.pre_skip_aug(tokens)
    # A set gives O(1) membership checks in the loop below.
    aug_word_idxes = set(
        get_aug_idxes(
            self,
            tokens,
            filtered_word_idxes,
            aug_word_cnt,
            Method.WORD,
            self.min_char,
        )
    )
    if not aug_word_idxes:
        return data

    results = []
    for t_i, token in enumerate(tokens):
        # A token shorter than two characters cannot be cut into two
        # non-empty parts; random.randint(1, 0) would raise ValueError,
        # so such tokens are passed through unchanged.
        if t_i not in aug_word_idxes or len(token) < 2:
            results.append(token)
            continue
        # randint is inclusive on both ends: the cut lands strictly inside
        # the token, so neither half is empty.
        split_position = random.randint(1, len(token) - 1)
        results.extend((token[:split_position], token[split_position:]))
    return detokenize(results)