in pipeline/alignments/align.py
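# Imports this excerpt relies on. `logger`, `Tokenization`, and `run` are
# defined elsewhere in the module; the stdlib imports below are an assumption
# about what the full file already contains.
import argparse
import os
import sys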
def main() -> None:
logger.info(f"Running with arguments: {sys.argv}")
parser = argparse.ArgumentParser(
description=__doc__,
# Preserves whitespace in the help text.
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument("--type", metavar="TYPE", type=str, help="Dataset type: mono or corpus")
parser.add_argument(
"--corpus_src",
metavar="CORPUS_SRC",
type=str,
help="Full path to the source sentences in a parallel dataset. Supports decompression using zstd. "
"For example `fetches/corpus.ru` or `fetches/corpus.ru.zst`",
)
parser.add_argument(
"--corpus_trg",
metavar="CORPUS_TRG",
type=str,
help="Full path to the target sentences in a parallel dataset. Supports decompression using zstd. "
"For example `fetches/corpus.en` or `fetches/corpus.en.zst`",
)
parser.add_argument(
"--output_path",
metavar="OUTPUT_PATH",
type=str,
help="A full path to the output alignments file. It will be compressed if the path ends with .zst. "
"For example artifacts/corpus.aln or artifacts/corpus.aln.zst",
)
parser.add_argument(
"--priors_input_path",
metavar="PRIORS_INPUT_PATH",
type=str,
default=None,
help="A full path to the model priors calculated in advance. This can speed up generation.",
)
parser.add_argument(
"--priors_output_path",
metavar="PRIORS_OUTPUT_PATH",
type=str,
default=None,
help="Calculate and save the model priors to the specified file path. "
"The file will be compressed if it ends with .zst",
)
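    # Priors saved via --priors_output_path on one run can be reused through
    # --priors_input_path on later runs to speed up alignment of new data.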
    parser.add_argument(
        "--tokenization",
        metavar="TOKENIZATION",
        type=Tokenization,
        choices=list(Tokenization),
        default=Tokenization.spaces,
        help="Use the specified tokenization method. The default is `spaces`, which applies no tokenization. "
        "If another tokenization method is used, the alignments are remapped back to whitespace-tokenized ones.",
    )
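    # `BooleanOptionalAction` consumes no value and also registers a paired
    # `--no-output_tokenized` flag, so `type` and `metavar` are omitted below
    # (passing them to this action is deprecated since Python 3.12).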
    parser.add_argument(
        "--output_tokenized",
        default=False,
        action=argparse.BooleanOptionalAction,
        help="Output the tokenized corpus and do not remap alignments to whitespace-based tokenization",
    )
    parser.add_argument(
        "--chunk_lines",
        metavar="CHUNK_LINES",
        type=int,
        # Use the env variable to allow overriding the default from tests.
        default=int(os.getenv("ALN_CHUNK_LINES", "50000000")),
        help="Split the corpus into chunks of N lines to calculate alignments on them separately. "
        "This helps with reducing the memory footprint. 50M lines by default.",
    )
    args = parser.parse_args()
    logger.info("Starting to generate alignments.")
    run(
        corpus_src=args.corpus_src,
        corpus_trg=args.corpus_trg,
        output_path=args.output_path,
        tokenization=args.tokenization,
        chunk_lines=args.chunk_lines,
        output_tokenized=args.output_tokenized,
        priors_input_path=args.priors_input_path,
        priors_output_path=args.priors_output_path,
    )
    logger.info("Finished generating alignments.")