def main()

in pipeline/data/analyze.py [0:0]
66 lines of code
9 McCabe index (conditional complexity)

def main(args: Optional[list[str]] = None) -> None:
    """Plot word-count and line-length distributions for one or more datasets.

    Every line streamed from ``--file_location`` is measured twice: its length
    (``len(line)``) and its number of whitespace-separated words
    (``len(line.split())``). Both measurements are collected into histograms and
    written as PNG graphs into ``--output_dir``, named
    ``<dataset file-safe name>.<language>.distribution-{words,codepoints}.png``.

    Several datasets can be analyzed in a single invocation by separating each
    group of options with ``--``; everything after the first ``--`` is handed to
    a recursive call of this function.

    Args:
        args: The command-line arguments to parse. When ``None``, argparse reads
            them from ``sys.argv``.

    Raises:
        Exception: If trailing arguments are present that are not introduced
            by ``--``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,  # Preserves whitespace in the help text.
    )
    parser.add_argument(
        "--file_location", type=str, required=True, help="A url or file path for analyzing."
    )
    parser.add_argument(
        "--output_dir", type=str, required=True, help="The directory for the output."
    )
    parser.add_argument("--dataset", type=str, required=True, help="The name of the dataset")
    parser.add_argument(
        "--language",
        type=str,
        required=True,
        help="The dataset language, as a BCP-47 language tag",
    )
    # Allow the use of "--" to add more arguments. REMAINDER collects everything
    # left over (including the "--" itself) without interpreting it.
    parser.add_argument("next_dataset_args", nargs=argparse.REMAINDER)

    parsed_args = parser.parse_args(args)

    # Defer parsing any options after "--", and recurse below if there are some.
    next_dataset_args: Optional[list[str]] = None
    remaining_args = parsed_args.next_dataset_args
    if remaining_args:
        if remaining_args[0] != "--":
            # Show the offending arguments themselves; `next_dataset_args` has not
            # been assigned yet at this point and would only ever print `None`.
            print(remaining_args)
            raise Exception("Unexpected arguments. Use -- to pass in multiple datasets.")
        # Drop the "--" separator; the rest describes the next dataset.
        next_dataset_args = remaining_args[1:]

    logger.info(f"file_location: {parsed_args.file_location}")
    logger.info(f"output_dir: {parsed_args.output_dir}")
    logger.info(f"dataset: {parsed_args.dataset}")
    logger.info(f"language: {parsed_args.language}")

    dataset = Dataset(parsed_args.dataset)
    graph_prefix = f"{dataset.file_safe_name()}.{parsed_args.language}"
    # Second line of both graph titles.
    title_suffix = f"{parsed_args.dataset} - {parsed_args.language}"

    # Compute the distributions for both the codepoints, and word size.
    codepoints_distribution = Histogram()
    word_distribution = Histogram()
    with get_line_streamer(parsed_args.file_location) as lines:
        for line in lines:
            codepoints_distribution.count(len(line))
            word_distribution.count(len(line.split()))

    plot_logarithmic_histogram(
        word_distribution,
        max_size=5_000,  # words
        title="\n".join(["Word Count Distribution", title_suffix]),
        x_axis_label="Words (log scale)",
        filename=os.path.join(parsed_args.output_dir, f"{graph_prefix}.distribution-words.png"),
    )

    plot_logarithmic_histogram(
        codepoints_distribution,
        max_size=10_000,  # codepoints
        title="\n".join(["Codepoints per Sentence Distribution", title_suffix]),
        x_axis_label="Codepoints (log scale)",
        filename=os.path.join(
            parsed_args.output_dir, f"{graph_prefix}.distribution-codepoints.png"
        ),
    )

    if next_dataset_args:
        # Apply the arguments again after "--".
        main(next_dataset_args)