in pipeline/data/analyze.py [0:0]
def main(args: Optional[list[str]] = None) -> None:
    """Plot word-count and codepoint-count distributions for one or more datasets.

    For each line of the input, the length of the line and the number of
    whitespace-separated tokens are tallied into two histograms, which are then
    rendered as logarithmic plots into ``--output_dir``.

    Several datasets can be processed in one invocation by separating each
    complete set of options with ``--``; everything after the separator is
    handed to a recursive call of this function.

    Args:
        args: Command line arguments to parse. When ``None``, argparse falls
            back to ``sys.argv[1:]``.

    Raises:
        Exception: If trailing arguments are present but do not begin with
            ``--``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,  # Preserves whitespace in the help text.
    )
    parser.add_argument(
        "--file_location", type=str, required=True, help="A url or file path for analyzing."
    )
    parser.add_argument(
        "--output_dir", type=str, required=True, help="The directory for the output."
    )
    parser.add_argument("--dataset", type=str, required=True, help="The name of the dataset")
    parser.add_argument(
        "--language",
        type=str,
        required=True,
        help="The dataset language, as a BCP-47 language tag",
    )
    # Allow the use of "--" to add more arguments.
    parser.add_argument("next_dataset_args", nargs=argparse.REMAINDER)

    parsed_args = parser.parse_args(args)

    # Defer parsing any options after "--", and recurse below if there are some.
    next_dataset_args: Optional[list[str]] = None
    remaining_args = parsed_args.next_dataset_args
    if remaining_args:
        if remaining_args[0] != "--":
            # Report the arguments that were actually received; the deferred
            # list is still None at this point and carries no information.
            raise Exception(
                "Unexpected arguments. Use -- to pass in multiple datasets. "
                f"Received: {remaining_args!r}"
            )
        next_dataset_args = remaining_args[1:]

    logger.info(f"file_location: {parsed_args.file_location}")
    logger.info(f"output_dir: {parsed_args.output_dir}")
    logger.info(f"dataset: {parsed_args.dataset}")
    logger.info(f"language: {parsed_args.language}")

    dataset = Dataset(parsed_args.dataset)
    graph_prefix = f"{dataset.file_safe_name()}.{parsed_args.language}"
    subtitle = f"{parsed_args.dataset} - {parsed_args.language}"

    # Compute the distributions for both the codepoints, and word size.
    codepoints_distribution = Histogram()
    word_distribution = Histogram()
    with get_line_streamer(parsed_args.file_location) as lines:
        for line in lines:
            # NOTE(review): len(line) includes a trailing newline if the
            # streamer yields one — confirm against get_line_streamer.
            codepoints_distribution.count(len(line))
            word_distribution.count(len(line.split()))

    plot_logarithmic_histogram(
        word_distribution,
        max_size=5_000,  # words
        title="\n".join(["Word Count Distribution", subtitle]),
        x_axis_label="Words (log scale)",
        filename=os.path.join(parsed_args.output_dir, f"{graph_prefix}.distribution-words.png"),
    )

    plot_logarithmic_histogram(
        codepoints_distribution,
        max_size=10_000,  # codepoints
        title="\n".join(["Codepoints per Sentence Distribution", subtitle]),
        x_axis_label="Codepoints (log scale)",
        filename=os.path.join(
            parsed_args.output_dir, f"{graph_prefix}.distribution-codepoints.png"
        ),
    )

    if next_dataset_args:
        # Apply the arguments again after "--".
        main(next_dataset_args)