in src/datatrove/tools/inspect_data.py [0:0]
def main():
    """Interactively browse (and optionally label) samples read from a data folder.

    Command-line arguments are taken from the module-level ``parser``. Any
    argument the parser does not recognise must have the form ``key=value``
    and is forwarded to ``reader_factory`` as a keyword argument (the value is
    passed as a string).

    Each sample that passes the sampler and the optional user-supplied
    filtering expression is shown in a pager. When a label folder was given,
    the user is then asked to mark the sample as good or bad; the labelled
    samples are written to ``good_samples.jsonl`` / ``bad_samples.jsonl`` in
    that folder when the loop ends, including when it ends because of an
    error.

    Raises:
        ValueError: if an extra command-line argument is not ``key=value``.
    """
    args, extra_args = parser.parse_known_args()

    # Split on the FIRST "=" only, so values that themselves contain "="
    # (e.g. "glob_pattern=a=b") are kept intact instead of breaking dict().
    kwargs = {}
    for extra_arg in extra_args:
        key, separator, value = extra_arg.partition("=")
        if not separator:
            raise ValueError(f"Invalid extra argument {extra_arg!r}: expected the form key=value")
        kwargs[key] = value

    data_folder = get_datafolder(args.path)
    label_folder = get_datafolder(args.label) if args.label else None
    reader = reader_factory(data_folder, args.reader, **kwargs)
    sampler = SamplerFilter(args.sample)
    console.print(
        f'Loading samples from "{data_folder.path}" with {reader} and sampling_rate={args.sample}.\n'
        f"Samples are displayed full page one by one.\n"
        f"If you don't see any color you may run \"export PAGER='less -r'\"."
    )

    # The filtering expression is optional; get_filter_expr receives None when
    # the user declines. NOTE(review): presumably it then returns an
    # accept-everything predicate — confirm against its definition.
    filter_expr_text = None
    if Confirm.ask(
        "Would you like to add a filtering expression? (ex: x.metadata['token_count'] > 5000)", default=False
    ):
        filter_expr_text = Confirm.get_input(console, "Type your filtering expression: ", password=False)
    filter_expr = get_filter_expr(filter_expr_text)

    good_samples = []
    bad_samples = []
    iterator = sampler(reader())
    try:
        for sample in iterator:
            if not filter_expr(sample):
                continue

            # Show the sample's id, metadata and text in a pager.
            with console.pager(styles=True):
                console.print(
                    Panel(
                        f"[yellow]Data ID:[reset] {sample.id}\n"
                        f"[yellow]Metadata:[reset]\n"
                        + "\n".join(f"- [blue]{field}: [reset] {value}" for field, value in sample.metadata.items())
                    )
                )
                console.print(sample.text)

            # Labelling is only offered when a label folder was requested.
            if label_folder:
                result = Prompt.ask(
                    "To label as good/bad example enter 'g'/'b'. Enter 'q' to skip labelling and move to the next "
                    "sample. Enter 'e' (exit) to leave:",
                    console=console,
                    choices=["g", "b", "e", "q"],
                )
                if result == "g":
                    good_samples.append(sample)
                elif result == "b":
                    bad_samples.append(sample)
                elif result == "e":
                    break
                # "q" falls through: the sample is left unlabelled.
    except Exception:
        # Report the error but still fall through to `finally`, so labels
        # collected so far are not lost.
        console.print_exception()
    finally:
        # Persist whatever has been labelled so far; a file is only written
        # when it has at least one sample.
        if label_folder:
            labelled = (
                ("good_samples.jsonl", good_samples),
                ("bad_samples.jsonl", bad_samples),
            )
            for filename, samples in labelled:
                if not samples:
                    continue
                with JsonlWriter(label_folder, filename, compression=None) as writer:
                    for sample in samples:
                        writer.write(sample)