in collection/download_commoncrawl_passages.py [0:0]
def sort_and_sample_filter_list(
    filter_list_path: Path, *, sample_every: int = 100
) -> None:
    """Sort the URLs of a filter list and write a 1-in-N sample of them.

    Every line of ``filter_list_path`` is read with trailing whitespace
    (including the newline) stripped, the resulting strings are sorted with
    the default string ordering, and every ``sample_every``-th entry --
    starting with the first one -- is written, one per line, to a file of
    the same name under the module-level ``sampled_filter_lists_root``.

    Args:
        filter_list_path: Path of the filter list to read.
        sample_every: Sampling stride; the entries at sorted positions
            0, N, 2N, ... are kept. Defaults to 100.

    Raises:
        ValueError: If ``sample_every`` is smaller than 1.
    """
    # A stride of 0 is meaningless and a negative one would silently walk the
    # sorted list backwards, so reject both up front.
    if sample_every < 1:
        raise ValueError(f"sample_every must be >= 1, got {sample_every}")

    # NOTE(review): both files are opened with the platform default text
    # encoding -- confirm whether the filter lists should be read as UTF-8.
    with open(filter_list_path) as f:
        urls = sorted(line.rstrip() for line in f)

    with open(sampled_filter_lists_root / filter_list_path.name, 'w') as f:
        # Slicing with a step keeps indices 0, N, 2N, ... of the sorted list.
        f.writelines(url + '\n' for url in urls[::sample_every])