in collection/download_wayback_passages.py [0:0]
def download_link(tup):
    """Fetch the Wayback Machine copy of one link and save its extracted text.

    Args:
        tup: Indexable with at least three items:
            ``tup[0]`` - the link to download (either an original URL or a
            URL that already matches ``wayback_prefix``);
            ``tup[1]`` - output directory; must support ``/`` joining and
            ``.mkdir`` (presumably a ``pathlib.Path`` - confirm with caller);
            ``tup[2]`` - number of workers, used as the modulus that picks
            the output sub-directory from the current process id.

    Returns:
        A dict describing the outcome. It always has ``link``, ``page_id``
        and ``available``. On success and in every exception path it also
        has ``status_code`` and ``wayback_url``. When the availability API
        reports no snapshot after three tries, only the three base keys are
        returned.

    All exceptions raised inside the ``try`` block are logged as warnings
    and reported through the returned dict rather than propagated.
    """
    link, output_path, num_workers = tup[0], tup[1], tup[2]
    page_id = str(uuid.uuid4())
    url_no_header = None
    # Bound up front so the exception handlers can inspect it even when the
    # failure happens before any request has completed.
    response = None
    try:
        # Find the Wayback Machine link
        if not wayback_prefix.match(link):
            link_encoded = urllib.parse.quote(link)
            available, availability_attempt = False, 0
            # Sometimes the API returns HTTP success code 200, but archived
            # snapshots shows page is unavailable when it actually is.
            # Give it a total of three tries.
            while not available and availability_attempt < 3:
                response = download_with_retry(
                    'http://archive.org/wayback/available'
                    f'?url={link_encoded}&timestamp=20191127'
                )
                json_response = response.json()
                available = 'closest' in json_response['archived_snapshots']
                availability_attempt += 1
            if not available:
                logging.warning(
                    f'Not available on Wayback Machine: {link}, HTTP code {response.status_code}, {json_response}'
                )
                return {'link': link, 'page_id': page_id, 'available': False}
            url = json_response['archived_snapshots']['closest']['url']
        else:
            url = link

        match = replace_pattern.search(url)
        if match is None:
            # Explicit raise instead of ``assert`` so the check survives
            # ``python -O``; the generic handler below reports it.
            raise ValueError(
                f'Unexpected Wayback Machine URL format: {url}'
            )
        # Append ``id_`` to the matched group so the rewritten URL is used
        # for the download (presumably the raw capture without the Wayback
        # toolbar - confirm against replace_pattern's definition).
        url_no_header = replace_pattern.sub(f'{match.group(1)}id_', url)
        response = download_with_retry(url_no_header)
        html_page = response.text
        parsed_text = extract_text(html_page)

        # Spread output files over ``num_workers`` sub-directories keyed by
        # the current process id.
        proc = multiprocessing.current_process()
        pid_mod = str(proc.pid % num_workers)
        target_dir = output_path / pid_mod
        target_dir.mkdir(parents=True, exist_ok=True)
        with open(target_dir / page_id, 'w', encoding='utf-8') as f:
            doc = {
                'id': url_no_header,
                'contents': parsed_text,
            }
            f.write(json.dumps(doc) + '\n')
        return {
            'link': link,
            'page_id': page_id,
            'available': True,
            'status_code': response.status_code,
            'wayback_url': url_no_header,
        }
    except HTTPError as http_err:
        logging.warning(f'HTTP error occurred: {http_err} for {link}')
        # Compare against None explicitly: a response object may evaluate
        # as falsy for error statuses, which would hide the very status
        # code this branch is meant to record.
        err_response = http_err.response
        return {
            'link': link,
            'page_id': page_id,
            'available': False,
            'status_code': (
                err_response.status_code if err_response is not None else None
            ),
            'wayback_url': url_no_header,
        }
    except UnicodeDecodeError as e:
        logging.warning(f'Unicode decode error occurred: {e} for {link}')
        return {
            'link': link,
            'page_id': page_id,
            'available': False,
            # ``response`` is still None if decoding failed before the first
            # request returned.
            'status_code': (
                response.status_code if response is not None else None
            ),
            'wayback_url': url_no_header,
        }
    except Exception as e:
        logging.warning(f'Exception occurred: {e} for {link}')
        return {
            'link': link,
            'page_id': page_id,
            'available': False,
            'status_code': None,
            'wayback_url': url_no_header,
        }