pipeline/alignments/tokenizer.py
#!/usr/bin/env python3
"""
Tokenizes a text file with line-separated sentences using a Moses or ICU tokenizer.
Example:
python pipeline/alignments/tokenizer.py --input_path=data/datasets/news.2023.en.shuffled.deduped \
--output_path=data/datasets/news.2023.en.shuffled.deduped.tok-icu --lang=en --chunk_size=500000 --tokenizer=icu
Using the C++ opus-fast-mosestokenizer sometimes requires setting LD_LIBRARY_PATH before starting the Python process,
see https://github.com/Helsinki-NLP/opus-fast-mosestokenizer/issues/6
export LD_LIBRARY_PATH=.../<your-python-env>/lib/python3.10/site-packages/mosestokenizer/lib
Using the ICU tokenizer requires installing it with `apt-get install python3-icu`;
see more installation instructions at https://pypi.org/project/PyICU/
Whitespace is not preserved by the Moses-based tokenizers. The ICU tokenizer preserves it by replacing each space with the special token "▁",
which allows lossless reconstruction of the original text on detokenization.
"""
import argparse
import multiprocessing
from abc import ABC, abstractmethod
from enum import Enum
from typing import List
from tqdm import tqdm
from pipeline.common.logging import get_logger
logger = get_logger("tokenizer")
class TokenizerType(Enum):
fast_moses = "fast_moses"
sacre_moses = "sacre_moses"
icu = "icu"
class Tokenizer(ABC):
def __init__(self, lang: str):
self.lang = lang
@abstractmethod
def tokenize(self, text: str) -> List[str]:
pass
@abstractmethod
def detokenize(self, tokens: List[str]) -> str:
pass
class FastMosesTokenizer(Tokenizer):
"""
Uses Moses tokenizer https://github.com/Helsinki-NLP/opus-fast-mosestokenizer
"""
def __init__(self, lang):
super().__init__(lang)
from mosestokenizer import MosesTokenizer
try:
self.tokenizer = MosesTokenizer(lang)
except RuntimeError as err:
msg = str(err)
if "No known abbreviations for language" in msg:
# Fall-back to English if the language is not found
self.tokenizer = MosesTokenizer("en")
else:
raise err
def tokenize(self, text: str) -> List[str]:
return self.tokenizer.tokenize(text)
def detokenize(self, tokens: List[str]) -> str:
return self.tokenizer.detokenize(tokens)
class SacreMosesTokenizer(Tokenizer):
"""
Uses Moses tokenizer https://github.com/hplt-project/sacremoses
"""
def __init__(self, lang):
super().__init__(lang)
import sacremoses
self.tokenizer = sacremoses.MosesTokenizer(lang)
self.detokenizer = sacremoses.MosesDetokenizer(lang)
def tokenize(self, text: str) -> List[str]:
return self.tokenizer.tokenize(text)
def detokenize(self, tokens: List[str]) -> str:
return self.detokenizer.detokenize(tokens)
class IcuTokenizer(Tokenizer):
"""
Uses ICU based word segmenter https://pypi.org/project/PyICU/
Preserves whitespaces as tokens by replacing them with a special character "▁".
Allows lossless reconstruction of the original text on detokenization.
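    For example (illustrative):
        tokenize("Hello, world!") -> ["Hello", ",", "▁", "world", "!"]
        detokenize(["Hello", ",", "▁", "world", "!"]) -> "Hello, world!"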
"""
# Same character is used by SentencePiece
SPACE_TOKEN = "▁"
def tokenize(self, text: str) -> List[str]:
from icu import BreakIterator, Locale
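        # Locale-aware word segmentation following the Unicode word boundary rules (UAX #29).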
bi = BreakIterator.createWordInstance(Locale(self.lang))
bi.setText(text)
tokens = []
start = bi.first()
for end in bi:
token = text[start:end]
if (
token and token != "\n"
): # exclude empty tokens, but leave whitespaces and replace them with a special token
tokens.append(token.replace(" ", self.SPACE_TOKEN))
start = end
return tokens
def detokenize(self, tokens: List[str]) -> str:
return "".join(tokens).replace(self.SPACE_TOKEN, " ")
def _read_file_in_chunks(file_path, chunk_size):
with open(file_path, "r", encoding="utf-8") as file:
while True:
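            # readlines(hint) returns whole lines and stops once their total size in
            # characters exceeds the hint, so chunk_size acts as an approximate character
            # budget per chunk rather than an exact line count.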
lines = file.readlines(chunk_size)
if not lines:
break
yield [line.rstrip() for line in lines]
def _tokenize_lines(params) -> List[str]:
lines, lang, tok_type = params
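    # Build the tokenizer inside the worker process; only the language and tokenizer
    # type are passed between processes, not the tokenizer object itself.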
if tok_type == TokenizerType.fast_moses:
tokenizer = FastMosesTokenizer(lang)
elif tok_type == TokenizerType.sacre_moses:
tokenizer = SacreMosesTokenizer(lang)
elif tok_type == TokenizerType.icu:
tokenizer = IcuTokenizer(lang)
else:
raise ValueError(f"Unknown tokenizer type: {tok_type}")
tokenized = []
for line in lines:
tokens = tokenizer.tokenize(line)
tokenized.append(" ".join(tokens))
return tokenized
def tokenize(
input_path: str,
output_path: str,
lang: str,
tokenizer: TokenizerType,
sentences_per_chunk: int = 100000,
) -> None:
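    """
    Tokenizes the input file in parallel chunks and writes the result to the output file,
    one line of space-separated tokens per input line.
    """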
logger.info(f"Tokenizing {input_path} with Moses tokenizer")
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        with open(output_path, "w", encoding="utf-8") as output_file:
chunks = _read_file_in_chunks(input_path, chunk_size=sentences_per_chunk)
pbar = tqdm(mininterval=10)
# ~100K sentences per second on a single core
for tokenized_chunk in pool.imap(
_tokenize_lines,
((ch, lang, tokenizer) for ch in chunks),
):
output_file.write("\n".join(tokenized_chunk) + "\n")
pbar.update(len(tokenized_chunk))
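# Example programmatic use (illustrative; file paths are placeholders):
#     tokenize(
#         input_path="corpus.en.txt",
#         output_path="corpus.en.tok",
#         lang="en",
#         tokenizer=TokenizerType.icu,
#         sentences_per_chunk=500000,
#     )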
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=__doc__,
# Preserves whitespace in the help text.
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
"--output_path",
metavar="OUTPUT_PATH",
type=str,
help="Output file",
)
parser.add_argument(
"--input_path",
metavar="INPUT_PATH",
type=str,
default=None,
help="Input file",
)
parser.add_argument(
"--lang",
metavar="LANG",
type=str,
default=None,
help="Language",
)
parser.add_argument(
"--chunk_size",
metavar="CHUNK_SIZE",
type=int,
default=None,
help="Number of lines to process per chunk",
)
parser.add_argument(
"--tokenizer",
metavar="TOKENIZER",
type=TokenizerType,
choices=TokenizerType,
default=TokenizerType.icu,
help="Tokenization method",
)
args = parser.parse_args()
tokenize(
input_path=args.input_path,
output_path=args.output_path,
lang=args.lang,
sentences_per_chunk=args.chunk_size,
tokenizer=args.tokenizer,
)