in pipeline/alignments/align.py
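# Imports this excerpt relies on. `logger`, `Tokenization`, and `run` are
# defined elsewhere in the module; the stdlib imports below are an assumption
# about what the full file already contains.
import argparse
import os
import sys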
def main() -> None:
logger.info(f"Running with arguments: {sys.argv}")
parser = argparse.ArgumentParser(
description=__doc__,
# Preserves whitespace in the help text.
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument("--type", metavar="TYPE", type=str, help="Dataset type: mono or corpus")
parser.add_argument(
"--corpus_src",
metavar="CORPUS_SRC",
type=str,
help="Full path to the source sentences in a parallel dataset. Supports decompression using zstd. "
"For example `fetches/corpus.ru` or `fetches/corpus.ru.zst`",
)
parser.add_argument(
"--corpus_trg",
metavar="CORPUS_TRG",
type=str,
help="Full path to the target sentences in a parallel dataset. Supports decompression using zstd. "
"For example `fetches/corpus.en` or `fetches/corpus.en.zst`",
)
parser.add_argument(
"--output_path",
metavar="OUTPUT_PATH",
type=str,
help="A full path to the output alignments file. It will be compressed if the path ends with .zst. "
"For example artifacts/corpus.aln or artifacts/corpus.aln.zst",
)
parser.add_argument(
"--priors_input_path",
metavar="PRIORS_INPUT_PATH",
type=str,
default=None,
help="A full path to the model priors calculated in advance. This can speed up generation.",
)
parser.add_argument(
"--priors_output_path",
metavar="PRIORS_OUTPUT_PATH",
type=str,
default=None,
help="Calculate and save the model priors to the specified file path. "
"The file will be compressed if it ends with .zst",
)
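    # Priors saved via --priors_output_path on one run can be reused through
    # --priors_input_path on later runs to speed up alignment of new data.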
    parser.add_argument(
        "--tokenization",
        metavar="TOKENIZATION",
        type=Tokenization,
        choices=list(Tokenization),
        default=Tokenization.spaces,
        help="Use the specified tokenization method. The default is `spaces`, which applies no tokenization. "
        "If another tokenization method is used, the alignments are remapped back to whitespace-tokenized ones.",
    )
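    # `BooleanOptionalAction` consumes no value and also registers a paired
    # `--no-output_tokenized` flag, so `type` and `metavar` are omitted below
    # (passing them to this action is deprecated since Python 3.12).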
    parser.add_argument(
        "--output_tokenized",
        default=False,
        action=argparse.BooleanOptionalAction,
        help="Output the tokenized corpus and do not remap alignments to whitespace-based tokenization",
    )
    parser.add_argument(
        "--chunk_lines",
        metavar="CHUNK_LINES",
        type=int,
        # Use the env variable to allow overriding the default from tests.
        default=int(os.getenv("ALN_CHUNK_LINES", "50000000")),
        help="Split the corpus into chunks of N lines to calculate alignments on them separately. "
        "This helps with reducing the memory footprint. 50M lines by default.",
    )
    args = parser.parse_args()
    logger.info("Starting to generate alignments.")
    run(
        corpus_src=args.corpus_src,
        corpus_trg=args.corpus_trg,
        output_path=args.output_path,
        tokenization=args.tokenization,
        chunk_lines=args.chunk_lines,
        output_tokenized=args.output_tokenized,
        priors_input_path=args.priors_input_path,
        priors_output_path=args.priors_output_path,
    )
    logger.info("Finished generating alignments.")