build_obelics/02_parallel_extract_html_get_image_urls.py (113 lines of code) (raw):
import os
import subprocess
import sys

import numpy as np
# Index of the machine running this script, taken from the first command-line
# argument. It selects which chunk of the remaining indices this process
# handles. Fail with a readable message instead of a bare traceback when the
# argument is missing or is not an integer.
if len(sys.argv) < 2:
    sys.exit(f"usage: python3 {sys.argv[0]} IDX_MACHINE")
try:
    idx_machine = int(sys.argv[1])
except ValueError:
    sys.exit(f"IDX_MACHINE must be an integer, got {sys.argv[1]!r}")
# Indices that are excluded from the work still to be distributed (presumably
# because an earlier run already processed them -- confirm before editing).
# Listed in ascending order, one contiguous run per row; the set is only used
# for membership tests, so the ordering here has no effect on behaviour.
IDX_DONE = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
    29, 30, 31, 32, 33, 34, 35, 36, 37,
    48, 49, 50, 51, 52, 53, 54, 55, 56,
    67, 68, 69, 70, 71, 72, 73, 74, 75,
    86, 87, 88, 89, 90, 91, 92, 93, 94,
    105, 106, 107, 108, 109, 110, 111, 112, 113,
    124, 125, 126, 127, 128, 129, 130, 131, 132,
    143, 144, 145, 146, 147, 148, 149, 150, 151,
    162, 163, 164, 165, 166, 167, 168, 169, 170,
    181, 182, 183, 184, 185, 186, 187, 188, 189,
}
# Number of machines the remaining indices are divided across; each machine is
# expected to be launched with its own index in [0, NUM_MACHINES).
NUM_MACHINES = 21


def machine_shard(indices, num_machines, machine_idx):
    """Return the chunk of ``indices`` assigned to machine ``machine_idx``.

    ``indices`` is cut into ``num_machines`` contiguous, near-equal chunks with
    ``numpy.array_split`` (earlier chunks receive the extra elements when the
    length is not evenly divisible), and the chunk at position ``machine_idx``
    is returned as a plain Python list.

    Raises:
        ValueError: if ``machine_idx`` is outside ``[0, num_machines)``.
            Negative values are rejected explicitly because plain list
            indexing would silently wrap around to another machine's chunk.
    """
    if not 0 <= machine_idx < num_machines:
        raise ValueError(
            f"idx_machine must be in [0, {num_machines - 1}], got {machine_idx}"
        )
    return np.array_split(indices, num_machines)[machine_idx].tolist()


def run_extraction(indices):
    """Run the extraction script once per index and return the indices that failed.

    Each index is passed as the single command-line argument of
    ``extract_html_get_image_urls.py``, one child process at a time. A
    non-zero exit status does not stop the loop, so one bad index cannot block
    the others; the failing indices are collected and returned to the caller.
    """
    failed = []
    for idx in indices:
        # Argument list instead of a shell string: no shell parsing involved,
        # and the child's exit status is directly available.
        completed = subprocess.run(
            [
                "python3",
                "m4/sourcing/data_collection/callers/extract_html_get_image_urls.py",
                str(idx),
            ]
        )
        if completed.returncode != 0:
            failed.append(idx)
    return failed


if __name__ == "__main__":
    # Every index in 0..199 that is not listed in IDX_DONE is still to be run.
    IDX_REMAINING = [idx for idx in range(200) if idx not in IDX_DONE]
    try:
        IDX = machine_shard(IDX_REMAINING, NUM_MACHINES, idx_machine)
    except ValueError as err:
        sys.exit(str(err))
    failed_indices = run_extraction(IDX)
    if failed_indices:
        # Surface failures through the exit status so they are not mistaken
        # for completed work.
        sys.exit(f"extract_html_get_image_urls.py failed for indices: {failed_indices}")