in src/datatrove/utils/text.py [0:0]
def split_into_parts(text, mode="DOCUMENT", language=Languages.english):
    """Split `text` into pieces according to `mode`.

    Depending on `mode`, returns the whole document as a single-element list,
    a list of sentence spans (covering the full text, whitespace included),
    a list of word tokens, or a list of line-level paragraph chunks.

    Args:
        text: the input string to split.
        mode: one of the SPLIT_TEXT_* constants selecting the granularity.
        language: language passed to the word tokenizer (used for the
            sentence and word modes only).

    Raises:
        ValueError: if `mode` is not one of the known SPLIT_TEXT_* values.
    """
    from datatrove.utils.word_tokenizers import load_word_tokenizer

    if mode == SPLIT_TEXT_DOCUMENTS:
        return [text]
    if mode == SPLIT_TEXT_SENTENCES:
        tokenizer = load_word_tokenizer(language)
        # End offset of each detected sentence; slice boundaries are built so
        # every character of `text` lands in exactly one piece (the final
        # piece runs to the end of the text).
        ends = [end for _, end in tokenizer.span_tokenize(text)]
        starts = [0] + ends[:-1]
        stops = ends[:-1] + [len(text)]
        return [text[start:stop] for start, stop in zip(starts, stops)]
    if mode == SPLIT_TEXT_WORDS:
        return load_word_tokenizer(language).word_tokenize(text)
    if mode == SPLIT_TEXT_PARAGRAPHS:
        # Each non-blank line begins a new chunk; any blank lines that follow
        # it are folded into that same chunk (whitespace merged with the
        # previous line). Newlines are re-inserted between original lines,
        # except after the very last one.
        raw_lines = text.splitlines()
        last_idx = len(raw_lines) - 1
        chunks = []
        pending = []
        for idx, line in enumerate(raw_lines):
            if pending and line.strip():
                chunks.append("".join(pending))
                pending = []
            pending.append(line)
            if idx != last_idx:
                pending.append("\n")
        if pending:
            chunks.append("".join(pending))
        return chunks
    raise ValueError(f"Unknown {mode=}")