in cc_net/mine.py [0:0]
def main(config: str = "base", **config_as_dict: Any) -> None:
    """Resolve a configuration, run the mining pipeline, then post-process its output.

    Args:
        config: Either a key of ``PREDEF_CONFIGS`` or a path to an existing
            file, which is then loaded through ``Config.from_json``.
        **config_as_dict: Per-field overrides applied on top of the resolved
            configuration. Entries whose value is ``None`` are ignored.

    Raises:
        ValueError: If ``config`` is neither a predefined configuration name
            nor an existing path.
    """
    # Step 1: pick the base configuration named (or pointed to) by `config`.
    if config in PREDEF_CONFIGS:
        base_conf = PREDEF_CONFIGS[config]
    else:
        json_file = Path(config)
        if not json_file.exists():
            known_names = ", ".join(PREDEF_CONFIGS)
            raise ValueError(
                f"Invalid value {config} for --config. "
                f"Choose from ({known_names}) or give an existing .json file."
            )
        base_conf = Config.from_json(json_file)

    # Step 2: layer the explicitly provided keyword overrides on top of it.
    # A value of None means "not provided", so the base value is kept.
    overrides = {}
    for field_name, value in config_as_dict.items():
        if value is not None:
            overrides[field_name] = value
    conf = base_conf._replace(**overrides)

    print("Will run cc_net.mine.main with the following config:", conf)

    # Step 3: run the pipeline.
    mined = mine(conf)

    if conf.will_split:
        # When splitting, every output of `mine` must be a directory.
        assert mined
        assert all(d.is_dir() for d in mined)
        if "split_by_lang" in conf.pipeline:
            # Only try regrouping if we split the shards.
            regroup(conf, mined)
        elif "split_by_segment" in conf.pipeline:
            # If we split by segment then regrouping is trivial,
            # since segments appear in only one shard.
            move_segments(conf, mined)

    # Step 4: the "test" configuration gets an extra validation pass.
    if conf.config_name == "test":
        regrouped_dir = conf.get_mined_dir(regroup=True)
        _validate_test(conf, regrouped_dir)