# build_obelics/06_02_merge_sets_image_urls_in_webdocs.py
# srun --pty --ntasks=1 --cpus-per-task=96 bash -i
# conda activate /fsx/m4/conda/shared-m4-2023-03-10
import os
import pickle
from collections import Counter
from tqdm import tqdm
# S3 prefix holding one sub-directory per shard, each containing a pickled
# Counter of image URLs found in the filtered web document dataset.
PATH_S3_IMAGE_URLS_IN_WEBDOCS = "s3://m4-datasets/webdocs/image_urls_in_web_document_dataset_filtered/"
# Number of per-shard Counter pickles expected under the S3 prefix above.
NUM_SHARDS = 200
# Image URLs occurring strictly more than this many times across the whole
# dataset are flagged as "too duplicated".
THRESHOLD_TOO_DUPLICATED = 10
if __name__ == "__main__":
    """Merge per-shard image-URL Counters into one global Counter, then
    extract the URLs that are duplicated beyond THRESHOLD_TOO_DUPLICATED.
    Both results are pickled to /scratch and uploaded to S3."""
    path_save_disk_image_urls_in_webdocs = "/scratch/image_urls_in_webdocs"

    # Download every per-shard pickle from S3. The sync is run three times
    # because `aws s3 sync` is incremental: re-running it is a cheap retry
    # for any file that failed to transfer on a previous pass.
    command_sync_s3 = f"aws s3 sync {PATH_S3_IMAGE_URLS_IN_WEBDOCS} {path_save_disk_image_urls_in_webdocs}"
    for _ in range(3):
        os.system(command_sync_s3)

    # Load one Counter (image URL -> occurrence count) per shard.
    all_counters = []
    for idx_shard in tqdm(range(NUM_SHARDS)):
        path_shard_pickle = os.path.join(
            path_save_disk_image_urls_in_webdocs,
            str(idx_shard),
            "image_urls_in_web_document_dataset_filtered.pickle",
        )
        with open(path_shard_pickle, "rb") as f:
            all_counters.append(pickle.load(f))

    # Merge all shard counters into a single global Counter.
    tot_counter = Counter()
    for counter in tqdm(all_counters):
        tot_counter.update(counter)

    with open("/scratch/tot_image_urls_in_web_document_dataset_filtered.pickle", "wb") as f:
        pickle.dump(tot_counter, f, pickle.HIGHEST_PROTOCOL)

    # Upload the merged counter; repeated invocations are best-effort retries
    # against transient failures (note: unlike `sync`, `cp` re-uploads fully).
    command_sync_s3 = (
        "aws s3 cp /scratch/tot_image_urls_in_web_document_dataset_filtered.pickle"
        " s3://m4-datasets/webdocs/tot_image_urls_in_web_document_dataset_filtered.pickle"
    )
    for _ in range(3):
        os.system(command_sync_s3)

    # URLs occurring strictly more than THRESHOLD_TOO_DUPLICATED times.
    tot_image_urls_in_web_document_dataset_filtered_too_duplicated = [
        k for k, v in tot_counter.items() if v > THRESHOLD_TOO_DUPLICATED
    ]
    with open("/scratch/tot_image_urls_in_web_document_dataset_filtered_too_duplicated.pickle", "wb") as f:
        # BUG FIX: the original dumped `tot_counter` a second time here instead
        # of the too-duplicated URL list computed just above, so the
        # "too_duplicated" pickle never contained the intended data.
        pickle.dump(
            tot_image_urls_in_web_document_dataset_filtered_too_duplicated, f, pickle.HIGHEST_PROTOCOL
        )

    command_sync_s3 = (
        "aws s3 cp /scratch/tot_image_urls_in_web_document_dataset_filtered_too_duplicated.pickle"
        " s3://m4-datasets/webdocs/tot_image_urls_in_web_document_dataset_filtered_too_duplicated.pickle"
    )
    for _ in range(3):
        os.system(command_sync_s3)