in dpr_scale/utils/ccnews_stats.py [0:0]
def main(args, logger):
    """
    Log summary statistics for the files found under ``args.doc_dir``.

    Every file below ``args.doc_dir`` (searched recursively) is handed to
    ``process_cc_news_files``; the number of samples, the number of URLs and
    the average number of sentences and words per sample are then logged.

    Args:
        args: object exposing ``doc_dir`` (root directory to scan), ``debug``
            (when truthy only the first two files are processed) and
            ``workers`` (upper bound on the worker count).
        logger: object with an ``info`` method used for all output.

    Returns:
        None. Results are only logged.

    NOTE(review): the previous docstring read "No hard negative sampling
    done. Only positive samples given a passage are prepared. TODO: One way
    would be to get other passages from the same article as negatives."
    Nothing in this function performs sampling -- confirm whether that note
    belongs to ``process_cc_news_files`` or to another script.
    """
    logger.info(args.__dict__)
    files = [
        os.path.join(dir_path, file_name)
        for dir_path, _dir_names, file_names in os.walk(args.doc_dir)
        for file_name in file_names
    ]
    if args.debug:
        # Debug runs only look at a couple of files to stay fast.
        files = files[:2]
    if not files:
        # Without this guard the worker count below would be 0 and the
        # averages further down would divide by zero.
        logger.info(f"No files were found under {args.doc_dir}")
        return
    # Never start more workers than there are files to process.
    workers = min(args.workers, len(files))
    logger.info(f"Number of workers = {workers}")
    (
        num_samples,
        url_dict,
        total_num_sents,
        total_num_words,
    ) = process_cc_news_files(files, workers, args.debug)
    logger.info(f"{num_samples} samples were found")
    logger.info(f"{len(url_dict)} URLs were found")
    if not num_samples:
        # Averages are undefined for an empty corpus; avoid ZeroDivisionError.
        logger.info("No samples were found; skipping averages")
        return
    logger.info(
        f"{total_num_sents/num_samples} is the avg number of sentences"
    )
    logger.info(f"{total_num_words/num_samples} is the avg number of words")