# build_obelics/02_parallel_extract_html_get_image_urls.py (113 lines of code) (raw):

"""Run the HTML image-URL extraction caller for one machine's share of indices.

Usage::

    python3 02_parallel_extract_html_get_image_urls.py <idx_machine>

The indices in ``range(NUM_INDICES)`` that are not listed in ``IDX_DONE`` are
split into ``NUM_MACHINES`` nearly equal contiguous slices with
``numpy.array_split``.  The slice selected by ``<idx_machine>`` is processed
sequentially, launching the extraction caller once per index.
"""

import subprocess
import sys
from typing import List, Optional, Sequence

import numpy as np

# Total number of indices considered: 0 .. NUM_INDICES - 1.
NUM_INDICES = 200

# Indices that are excluded from this run (already done), one contiguous run
# per row.
IDX_DONE = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
    29, 30, 31, 32, 33, 34, 35, 36, 37,
    48, 49, 50, 51, 52, 53, 54, 55, 56,
    67, 68, 69, 70, 71, 72, 73, 74, 75,
    86, 87, 88, 89, 90, 91, 92, 93, 94,
    105, 106, 107, 108, 109, 110, 111, 112, 113,
    124, 125, 126, 127, 128, 129, 130, 131, 132,
    143, 144, 145, 146, 147, 148, 149, 150, 151,
    162, 163, 164, 165, 166, 167, 168, 169, 170,
    181, 182, 183, 184, 185, 186, 187, 188, 189,
}

IDX_REMAINING = [idx for idx in range(NUM_INDICES) if idx not in IDX_DONE]

NUM_MACHINES = 21

# Path of the per-index caller, relative to the current working directory.
EXTRACT_SCRIPT = "m4/sourcing/data_collection/callers/extract_html_get_image_urls.py"


def indices_for_machine(idx_machine: int) -> List[int]:
    """Return the remaining indices assigned to machine ``idx_machine``.

    Args:
        idx_machine: Zero-based machine number, ``0 <= idx_machine < NUM_MACHINES``.

    Returns:
        The slice of ``IDX_REMAINING`` for that machine, as plain Python ints.

    Raises:
        ValueError: If ``idx_machine`` is outside the valid range.  Negative
            values are rejected explicitly because plain list indexing would
            silently wrap around and duplicate another machine's work.
    """
    if not 0 <= idx_machine < NUM_MACHINES:
        raise ValueError(
            f"idx_machine must be in [0, {NUM_MACHINES - 1}], got {idx_machine}"
        )
    slices = np.array_split(IDX_REMAINING, NUM_MACHINES)
    return slices[idx_machine].tolist()


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Process every index assigned to the machine named on the command line.

    Args:
        argv: Full argument vector (program name first); defaults to
            ``sys.argv``.  Only ``argv[1]`` is read; extra arguments are
            ignored.

    Returns:
        ``0`` if every extraction succeeded, ``1`` if at least one failed,
        ``2`` if the command-line argument is missing or invalid.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        prog = args[0] if args else "02_parallel_extract_html_get_image_urls.py"
        print(f"usage: {prog} <idx_machine>", file=sys.stderr)
        return 2

    try:
        assigned = indices_for_machine(int(args[1]))
    except ValueError as err:
        print(f"error: {err}", file=sys.stderr)
        return 2

    failed = []
    for idx in assigned:
        # Argument list instead of a shell string: no shell parsing involved.
        command = ["python3", EXTRACT_SCRIPT, str(idx)]
        try:
            returncode = subprocess.run(command, check=False).returncode
        except OSError as err:
            print(f"error: could not launch extraction for index {idx}: {err}", file=sys.stderr)
            failed.append(idx)
            continue
        if returncode != 0:
            # Keep going so one bad index does not block the rest of the
            # slice, but remember it so the failure is not lost.
            print(
                f"error: extraction for index {idx} exited with status {returncode}",
                file=sys.stderr,
            )
            failed.append(idx)

    if failed:
        print(f"failed indices: {failed}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())