# download_link() — defined in collection/download_wayback_passages.py

def download_link(tup):
    """Fetch one page's archived text via the Wayback Machine and save it.

    Args:
        tup: 3-tuple of (link, output_path, num_workers), packed so this
            function can be dispatched through multiprocessing map-style APIs.
            link: URL to fetch — either an original URL (resolved through the
                Wayback availability API) or an existing Wayback Machine URL.
            output_path: Path root under which the extracted text is written.
            num_workers: worker count; used to shard output files into
                per-process subdirectories.

    Returns:
        A result dict. Always contains 'link', 'page_id', and 'available';
        on success (and on most failure paths) also 'status_code' and
        'wayback_url'.
    """
    link, output_path, num_workers = tup
    page_id = str(uuid.uuid4())
    url_no_header = None
    # Pre-bind so the except handlers below can reference it safely even if
    # an exception fires before any HTTP request is made.
    response = None

    try:
        # Resolve to a Wayback Machine snapshot URL unless the link already is one.
        if not wayback_prefix.match(link):
            link_encoded = urllib.parse.quote(link)

            available, availability_attempt = False, 0
            # Sometimes the API returns HTTP success code 200, but archived_snapshots
            # shows the page as unavailable when it actually is available.
            # Give it a total of three tries.
            while not available and availability_attempt < 3:
                response = download_with_retry(
                    f'http://archive.org/wayback/available?url={link_encoded}&timestamp=20191127'
                )
                json_response = response.json()
                available = 'closest' in json_response['archived_snapshots']
                availability_attempt += 1

            if not available:
                logging.warning(
                    f'Not available on Wayback Machine: {link}, HTTP code {response.status_code}, {json_response}'
                )
                return {'link': link, 'page_id': page_id, 'available': False}

            url = json_response['archived_snapshots']['closest']['url']
        else:
            url = link

        match = replace_pattern.search(url)
        # Raise explicitly instead of `assert`: asserts are stripped under
        # `python -O`, which would let a malformed URL fall through to a
        # confusing AttributeError on `match.group(1)` below. The surrounding
        # `except Exception` handler maps this to the same failure dict either way.
        if not match:
            raise ValueError(f'Unexpected Wayback Machine URL format: {url}')
        # Insert the 'id_' flag so the Wayback Machine serves the raw archived
        # page without injecting its replay header/toolbar markup.
        url_no_header = replace_pattern.sub(f'{match.group(1)}id_', url)

        response = download_with_retry(url_no_header)
        html_page = response.text
        parsed_text = extract_text(html_page)

        # Shard output across num_workers subdirectories keyed by worker PID,
        # so concurrent workers never write into the same directory listing.
        proc = multiprocessing.current_process()
        pid_mod = str(proc.pid % num_workers)

        (output_path / pid_mod).mkdir(parents=True, exist_ok=True)

        with open(output_path / pid_mod / page_id, 'w') as f:
            doc = {
                'id': url_no_header,
                'contents': parsed_text,
            }
            f.write(json.dumps(doc) + '\n')

        return {
            'link': link,
            'page_id': page_id,
            'available': True,
            'status_code': response.status_code,
            'wayback_url': url_no_header,
        }
    except HTTPError as http_err:
        logging.warning(f'HTTP error occurred: {http_err} for {link}')
        return {
            'link': link,
            'page_id': page_id,
            'available': False,
            'status_code': http_err.response.status_code if http_err.response else None,
            'wayback_url': url_no_header,
        }
    except UnicodeDecodeError as e:
        logging.warning(f'Unicode decode error occurred: {e} for {link}')
        return {
            'link': link,
            'page_id': page_id,
            'available': False,
            # `response` may still be None if decoding failed inside
            # download_with_retry before a response object was returned.
            'status_code': response.status_code if response is not None else None,
            'wayback_url': url_no_header,
        }
    except Exception as e:
        logging.warning(f'Exception occurred: {e} for {link}')
        return {
            'link': link,
            'page_id': page_id,
            'available': False,
            'status_code': None,
            'wayback_url': url_no_header,
        }