def main()

in src/datatrove/tools/inspect_data.py [0:0]


def main():
    """Interactively page through samples from a data folder and optionally label them.

    Command-line arguments are read from the module-level ``parser``. Any
    unrecognised arguments must have the form ``key=value``; they are forwarded
    to ``reader_factory`` as string keyword arguments. Only the first ``=`` is
    treated as the separator, so values may themselves contain ``=``.

    Each sample that passes the sampler and the optional user-supplied filter
    expression is shown in a pager. When a label folder was given, the user is
    prompted to mark the sample as good or bad; the collected samples are
    written to ``good_samples.jsonl`` / ``bad_samples.jsonl`` in that folder
    when the loop ends, including when it ends because of an error.

    Raises:
        ValueError: if an extra command-line argument does not contain ``=``.
    """
    args, extra_args = parser.parse_known_args()

    # Split on the first "=" only: a plain split("=") breaks on values such as
    # "pattern=a=b" and fails with an obscure dict-construction error.
    kwargs = {}
    for extra_arg in extra_args:
        key, separator, value = extra_arg.partition("=")
        if not separator:
            raise ValueError(f"Invalid extra argument {extra_arg!r}: expected the form key=value")
        kwargs[key] = value

    data_folder = get_datafolder(args.path)
    label_folder = get_datafolder(args.label) if args.label else None

    reader = reader_factory(data_folder, args.reader, **kwargs)

    sampler = SamplerFilter(args.sample)

    console.print(
        f'Loading samples from "{data_folder.path}" with {reader} and sampling_rate={args.sample}.\n'
        f"Samples are displayed full page one by one.\n"
        f"If you don't see any color you may run \"export PAGER='less -r'\"."
    )

    filter_expr_text = None
    if Confirm.ask(
        "Would you like to add a filtering expression? (ex: x.metadata['token_count'] > 5000)", default=False
    ):
        filter_expr_text = Confirm.get_input(console, "Type your filtering expression: ", password=False)
    filter_expr = get_filter_expr(filter_expr_text)

    good_samples = []
    bad_samples = []
    iterator = sampler(reader())
    try:
        for sample in iterator:
            if not filter_expr(sample):
                continue
            with console.pager(styles=True):
                console.print(
                    Panel(
                        f"[yellow]Data ID:[reset] {sample.id}\n"
                        f"[yellow]Metadata:[reset]\n"
                        + "\n".join(f"- [blue]{field}: [reset] {value}" for field, value in sample.metadata.items())
                    )
                )
                console.print(sample.text)
            if label_folder:
                result = Prompt.ask(
                    "To label as good/bad example enter 'g'/'b'. Enter 'q' to skip labelling and move to the next "
                    "sample. Enter 'e' (exit) to leave:",
                    console=console,
                    choices=["g", "b", "e", "q"],
                )
                if result == "g":
                    good_samples.append(sample)
                elif result == "b":
                    bad_samples.append(sample)
                elif result == "e":
                    break
                # "q" falls through: the sample is left unlabelled.
    except Exception:
        # Report the failure but keep going so the labels collected so far
        # are still written out below.
        console.print_exception()
    finally:
        if label_folder:
            labelled = (("good_samples.jsonl", good_samples), ("bad_samples.jsonl", bad_samples))
            for filename, samples in labelled:
                # Skip empty lists so no empty output file is created.
                if not samples:
                    continue
                with JsonlWriter(label_folder, filename, compression=None) as writer:
                    for sample in samples:
                        writer.write(sample)