scripts/per_dataset_script/add_gender_to_libritts_r.py (32 lines of code) (raw):

"""Add a ``gender`` column to a dataset by looking speakers up in a TSV file.

The script loads a dataset, maps every value of the speaker-id column to
``"male"`` or ``"female"`` using a tab-separated speakers file, and then
optionally saves the result to disk and/or pushes it to the hub.
"""

import argparse

import pandas as pd
from datasets import load_dataset
from multiprocess import set_start_method


def _parse_args(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is ``None``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset_name", type=str, help="Repo id or local path.")
    parser.add_argument(
        "tsv_path",
        type=str,
        help="Path to the tab-separated speakers file that holds each speaker's gender.",
    )
    parser.add_argument("--configuration", default=None, type=str, help="Dataset configuration to use.")
    parser.add_argument("--output_dir", default=None, type=str, help="If specified, save the dataset on disk.")
    parser.add_argument("--repo_id", default=None, type=str, help="If specified, push the dataset to the hub.")
    parser.add_argument("--speaker_id_column_name", default="speaker_id", type=str, help="Speaker id column name.")
    parser.add_argument(
        "--cpu_num_workers",
        default=1,
        type=int,
        help="Number of CPU workers for transformations that don't use GPUs or if no GPU are available.",
    )
    return parser.parse_args(argv)


def _load_gender_by_speaker(tsv_path):
    """Return the ``{row label: value}`` mapping of the TSV column labelled ``READER``.

    NOTE(review): the values of the column labelled ``READER`` are used as
    gender codes and are looked up by integer speaker id. This presumably
    relies on the speakers file having one fewer header field than data
    fields, in which case pandas uses the first data field as the index and
    the header labels shift by one -- confirm against the actual TSV before
    changing the column name.
    """
    table = pd.read_csv(tsv_path, sep="\t").to_dict()
    return table["READER"]


def _make_gender_mapper(gender_by_speaker):
    """Build the batched ``map`` function that produces the ``gender`` column.

    The lookup table is captured in a closure instead of being read from a
    module-level global, so the function carries its own data when it is
    shipped to worker processes.
    """

    def map_gender(speaker_ids):
        # Raises KeyError if a speaker id is absent from the TSV.
        codes = [gender_by_speaker[int(speaker)] for speaker in speaker_ids]
        # Any code other than "M" is labelled "female".
        return {"gender": ["male" if code == "M" else "female" for code in codes]}

    return map_gender


def main():
    """Run the script: load, annotate with gender, then save and/or push."""
    set_start_method("spawn")
    args = _parse_args()

    if args.configuration:
        dataset = load_dataset(args.dataset_name, args.configuration)
    else:
        dataset = load_dataset(args.dataset_name)

    gender_by_speaker = _load_gender_by_speaker(args.tsv_path)

    dataset = dataset.map(
        _make_gender_mapper(gender_by_speaker),
        batched=True,
        batch_size=128,
        input_columns=args.speaker_id_column_name,
        num_proc=args.cpu_num_workers,
    )

    if args.output_dir:
        dataset.save_to_disk(args.output_dir)

    if args.repo_id:
        if args.configuration:
            dataset.push_to_hub(args.repo_id, args.configuration)
        else:
            dataset.push_to_hub(args.repo_id)


if __name__ == "__main__":
    main()