def main()

in dpr_scale/utils/ccnews_stats.py [0:0]


def main(args, logger):
    """Log summary statistics for the CC-News files found under ``args.doc_dir``.

    Every file below ``args.doc_dir`` (walked recursively) is handed to
    ``process_cc_news_files``; the sample count, the number of URLs, and the
    average number of sentences and words per sample are then logged.

    Args:
        args: Namespace-like object. The attributes read here are
            ``doc_dir`` (root directory to walk), ``debug`` (when truthy only
            the first two files are processed) and ``workers`` (upper bound
            on the worker count passed on).
        logger: Logger-like object providing ``info`` and ``warning``.

    Returns:
        None. All results are reported through ``logger``.

    NOTE(review): the previous docstring here described preparing positive
    samples without hard negative sampling (with a TODO to use other passages
    from the same article as negatives). Nothing in this function builds
    samples, so that note presumably belongs to the sample-preparation code;
    confirm before relying on it.
    """
    logger.info(args.__dict__)

    files = [
        os.path.join(dir_path, file_name)
        for dir_path, _dir_names, file_names in os.walk(args.doc_dir)
        for file_name in file_names
    ]
    if args.debug:
        # Debug runs only look at a couple of files to stay fast.
        files = files[:2]

    if not files:
        # Without this guard the worker count would be 0 and the averages
        # below would divide by zero.
        logger.warning(
            f"No files found under {args.doc_dir}; nothing to process"
        )
        return

    # Never ask for more workers than there are files to process.
    workers = min(args.workers, len(files))
    logger.info(f"Number of workers = {workers}")
    (
        num_samples,
        url_dict,
        total_num_sents,
        total_num_words,
    ) = process_cc_news_files(files, workers, args.debug)
    logger.info(f"{num_samples} samples were found")
    logger.info(f"{len(url_dict)} URLs were found")

    if not num_samples:
        # Averages are undefined for an empty corpus; report it instead of
        # raising ZeroDivisionError.
        logger.warning("No samples were found; skipping average statistics")
        return

    logger.info(
        f"{total_num_sents/num_samples} is the avg number of sentences"
    )
    logger.info(f"{total_num_words/num_samples} is the avg number of words")