pipeline/alignments/tokenizer.py

#!/usr/bin/env python3
"""
Tokenizes a text file with line-separated sentences using a Moses or ICU tokenizer.

Example:
    python pipeline/alignments/tokenizer.py --input_path=data/datasets/news.2023.en.shuffled.deduped \
        --output_path=data/datasets/news.2023.en.shuffled.deduped.tok-icu --lang=en --chunk_size=500000 --tokenizer=icu

Using the C++ opus-fast-mosestokenizer sometimes requires specifying LD_LIBRARY_PATH before starting
the Python process, see https://github.com/Helsinki-NLP/opus-fast-mosestokenizer/issues/6:
    export LD_LIBRARY_PATH=.../<your-python-env>/lib/python3.10/site-packages/mosestokenizer/lib

Using the ICU tokenizer requires installing it with `apt-get install python3-icu`,
see more installation instructions here: https://pypi.org/project/PyICU/

Whitespace is discarded by the Moses-based tokenizers. The ICU tokenizer preserves it by replacing it
with the special token "▁", which allows lossless reconstruction of the original text on detokenization.
"""

import argparse
import multiprocessing
from abc import ABC, abstractmethod
from enum import Enum
from typing import List

from tqdm import tqdm

from pipeline.common.logging import get_logger

logger = get_logger("tokenizer")


class TokenizerType(Enum):
    fast_moses = "fast_moses"
    sacre_moses = "sacre_moses"
    icu = "icu"


class Tokenizer(ABC):
    def __init__(self, lang: str):
        self.lang = lang

    @abstractmethod
    def tokenize(self, text: str) -> List[str]:
        pass

    @abstractmethod
    def detokenize(self, tokens: List[str]) -> str:
        pass


class FastMosesTokenizer(Tokenizer):
    """
    Uses the Moses tokenizer https://github.com/Helsinki-NLP/opus-fast-mosestokenizer
    """

    def __init__(self, lang):
        super().__init__(lang)
        from mosestokenizer import MosesTokenizer

        try:
            self.tokenizer = MosesTokenizer(lang)
        except RuntimeError as err:
            msg = str(err)
            if "No known abbreviations for language" in msg:
                # Fall back to English if the language is not found
                self.tokenizer = MosesTokenizer("en")
            else:
                raise err

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer.tokenize(text)

    def detokenize(self, tokens: List[str]) -> str:
        return self.tokenizer.detokenize(tokens)


class SacreMosesTokenizer(Tokenizer):
    """
    Uses the Moses tokenizer https://github.com/hplt-project/sacremoses
    """

    def __init__(self, lang):
        super().__init__(lang)
        import sacremoses

        self.tokenizer = sacremoses.MosesTokenizer(lang)
        self.detokenizer = sacremoses.MosesDetokenizer(lang)

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer.tokenize(text)

    def detokenize(self, tokens: List[str]) -> str:
        return self.detokenizer.detokenize(tokens)


class IcuTokenizer(Tokenizer):
    """
    Uses the ICU-based word segmenter https://pypi.org/project/PyICU/
    Preserves whitespace as tokens by replacing it with the special character "▁".
    Allows lossless reconstruction of the original text on detokenization.
    """

    # The same character is used by SentencePiece
    SPACE_TOKEN = "▁"

    def tokenize(self, text: str) -> List[str]:
        from icu import BreakIterator, Locale

        bi = BreakIterator.createWordInstance(Locale(self.lang))
        bi.setText(text)

        tokens = []
        start = bi.first()
        for end in bi:
            token = text[start:end]
            if token and token != "\n":
                # Exclude empty tokens, but keep whitespace and replace it with the special token
                tokens.append(token.replace(" ", self.SPACE_TOKEN))
            start = end
        return tokens

    def detokenize(self, tokens: List[str]) -> str:
        return "".join(tokens).replace(self.SPACE_TOKEN, " ")


def _read_file_in_chunks(file_path, chunk_size):
    with open(file_path, "r", encoding="utf-8") as file:
        while True:
            # readlines() treats chunk_size as an approximate size hint (in characters),
            # not as an exact number of lines.
            lines = file.readlines(chunk_size)
            if not lines:
                break
            yield [line.rstrip() for line in lines]


def _tokenize_lines(params) -> List[str]:
    lines, lang, tok_type = params

    if tok_type == TokenizerType.fast_moses:
        tokenizer = FastMosesTokenizer(lang)
    elif tok_type == TokenizerType.sacre_moses:
        tokenizer = SacreMosesTokenizer(lang)
    elif tok_type == TokenizerType.icu:
        tokenizer = IcuTokenizer(lang)
    else:
        raise ValueError(f"Unknown tokenizer type: {tok_type}")

    tokenized = []
    for line in lines:
        tokens = tokenizer.tokenize(line)
        tokenized.append(" ".join(tokens))
    return tokenized


def tokenize(
    input_path: str,
    output_path: str,
    lang: str,
    tokenizer: TokenizerType,
    sentences_per_chunk: int = 100000,
) -> None:
    logger.info(f"Tokenizing {input_path} with the {tokenizer.value} tokenizer")
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        with open(output_path, "w") as output_file:
            chunks = _read_file_in_chunks(input_path, chunk_size=sentences_per_chunk)

            pbar = tqdm(mininterval=10)  # ~100K sentences per second on a single core
            for tokenized_chunk in pool.imap(
                _tokenize_lines,
                ((ch, lang, tokenizer) for ch in chunks),
            ):
                output_file.write("\n".join(tokenized_chunk) + "\n")
                pbar.update(len(tokenized_chunk))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Preserves whitespace in the help text.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--output_path",
        metavar="OUTPUT_PATH",
        type=str,
        help="Output file",
    )
    parser.add_argument(
        "--input_path",
        metavar="INPUT_PATH",
        type=str,
        default=None,
        help="Input file",
    )
    parser.add_argument(
        "--lang",
        metavar="LANG",
        type=str,
        default=None,
        help="Language",
    )
    parser.add_argument(
        "--chunk_size",
        metavar="CHUNK_SIZE",
        type=int,
        default=None,
        help="Number of lines to process per chunk",
    )
    parser.add_argument(
        "--tokenizer",
        metavar="TOKENIZER",
        type=TokenizerType,
        choices=TokenizerType,
        default=TokenizerType.icu,
        help="Tokenization method",
    )

    args = parser.parse_args()
    tokenize(
        input_path=args.input_path,
        output_path=args.output_path,
        lang=args.lang,
        sentences_per_chunk=args.chunk_size,
        tokenizer=args.tokenizer,
    )