# make_manifest.py
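"""Generate a manifest of site icons for a list of Top Sites.

The script scrapes each site's markup for icon links, picks the best icon per
site, and prints a JSON manifest grouping domains by icon URL. The example
invocations below are illustrative sketches; the file names are hypothetical:

    # Build a manifest for the top 100 Alexa sites (icons at least 96px wide).
    python make_manifest.py --count 100

    # Use a local "rank,domain" CSV and cache the raw scrape results.
    python make_manifest.py --topsitesfile sites.csv --saverawsitedata raw.json

    # Re-generate the manifest from cached data with a looser width cutoff.
    python make_manifest.py --loadrawsitedata raw.json --minwidth 64
"""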

import csv
import json
import logging
import zipfile
from io import BytesIO, StringIO
from urllib.parse import urljoin

import click
import requests
from PIL import Image
from robobrowser import RoboBrowser

from nsfw import is_nsfw

LINK_SELECTOR = (
    'link[rel=apple-touch-icon], link[rel=apple-touch-icon-precomposed], '
    'link[rel="icon shortcut"], link[rel="shortcut icon"], link[rel="icon"], '
    'link[rel="SHORTCUT ICON"], link[rel="fluid-icon"]'
)
META_SELECTOR = 'meta[name=apple-touch-icon]'
FIREFOX_UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) '
              'Gecko/20100101 Firefox/58.0')
IPHONE_UA = ('Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) '
             'AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1')
ALEXA_DATA_URL = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

# Sentinel width recorded for SVG icons, which are scalable and have no fixed width.
SVG_ICON_WIDTH = "SVG_ICON_WIDTH"

# Domains we want to exclude.
DOMAIN_EXCLUSION_LIST = [
    "higheurest.com",
    "blogspot.co.id",
    "pipeschannels.com",
    "blogspot.mx",
    "bestadbid.com",
    "googlevideo.com",
    "tqeobp89axcn.com",
    "ioredi.com",
    "moradu.com",
    "fedsit.com",
    "vebadu.com",
]

logging.basicConfig(filename='debug.log', level=logging.INFO)


def _fetch_alexa_top_sites():
    # Download and unzip the Alexa top-1m CSV, yielding (rank, domain) tuples.
    r = requests.get(ALEXA_DATA_URL, timeout=60)
    z = zipfile.ZipFile(BytesIO(r.content))
    rows = StringIO(z.read('top-1m.csv').decode('UTF-8'))
    for row in rows:
        rank, domain = row.split(',')
        yield (int(rank), domain.strip())


def _fetch_top_sites(topsitesfile):
    # Yield (rank, domain) tuples from a local "rank,domain" CSV file.
    with open(topsitesfile, newline='') as csvfile:
        rows = csv.reader(csvfile)
        for row in rows:
            if len(row) == 0:
                # Skip empty lines.
                continue
            yield (row[0], row[1])


def top_sites(topsitesfile, count):
    logging.info(f'Fetching top {count} sites')
    if topsitesfile:
        top_sites_generator = _fetch_top_sites(topsitesfile)
    else:
        top_sites_generator = _fetch_alexa_top_sites()
    return [next(top_sites_generator) for _ in range(count)]


def extra_sites(extrafile):
    logging.info('Fetching extra sites')
    return list(_fetch_top_sites(extrafile))


def is_url_reachable(url):
    try:
        response = requests.get(url, headers={'User-agent': FIREFOX_UA}, timeout=60)
        return response.status_code == 200
    except Exception as e:
        logging.info(f'Exception: "{str(e)}" while checking whether "{url}" is reachable')
        return False


def fetch_icons(url, user_agent=IPHONE_UA):
    logging.info(f'Fetching icons for {url}')
    icons = []
    browser = RoboBrowser(user_agent=user_agent, parser='html.parser')
    try:
        browser.open(url, timeout=60)
        for link in browser.select(LINK_SELECTOR):
            icon = link.attrs
            icon_url = icon['href']
            if icon_url.startswith('data:'):
                continue
            if not icon_url.startswith('http') and not icon_url.startswith('//'):
                # Resolve relative icon paths against the final (post-redirect) URL.
                icon['href'] = urljoin(browser.url, icon_url)
            icons.append(icon)
        for meta in browser.select(META_SELECTOR):
            icon = meta.attrs
            # Meta tags carry the URL in 'content'; copy it to 'href' so all
            # icon dicts can be handled uniformly downstream.
            icon_url = icon['content']
            if icon_url.startswith('data:'):
                continue
            if not icon_url.startswith('http') and not icon_url.startswith('//'):
                icon['href'] = urljoin(browser.url, icon_url)
            else:
                icon['href'] = icon_url
            icons.append(icon)
    except Exception as e:
        logging.info(f'Exception: "{str(e)}" while parsing icon urls from document')

    # If the document doesn't declare a favicon via the rel attribute of a link tag,
    # check whether a "favicon.ico" file is present at the root of the domain, as some
    # sites keep a favicon there without referencing it in the document.
    # Add the icon url if this is the case.
    if len(icons) == 0:
        default_favicon_url = f"{url}/favicon.ico"
        if is_url_reachable(default_favicon_url):
            icons.append({"href": default_favicon_url})
    return icons
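
# For illustration only: each entry returned by fetch_icons() is the tag's
# attribute dict (BeautifulSoup's attrs), so a hypothetical site might yield:
#
#     [{'rel': ['apple-touch-icon'], 'sizes': '180x180',
#       'href': 'https://example.com/apple-touch-icon.png'},
#      {'rel': ['icon'], 'href': 'https://example.com/favicon.ico'}]
#
# 'sizes' and 'mask' are optional attributes; get_best_icon() below uses them
# as hints when ranking candidates.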

def fix_url(url):
    # Protocol-relative URLs ("//example.com/icon.png") get an explicit https: scheme.
    fixed = url
    if not url.startswith('http'):
        fixed = 'https:{url}'.format(url=url)
    return fixed


def get_best_icon(images):
    image_url = None
    image_width = 0
    for image in images:
        url = fix_url(image.get('href'))
        width = None
        sizes = image.get('sizes')
        if sizes:
            try:
                width = int(sizes.split('x')[0])
            except Exception:
                pass
        if width is None:
            try:
                response = requests.get(url, headers={'User-agent': FIREFOX_UA}, timeout=60)
                # If it is an SVG, return it as the best icon: SVG images are
                # scalable, can be rendered at high quality at any resolution,
                # and lose no quality when zoomed or resized.
                if response.headers.get('Content-Type') == 'image/svg+xml':
                    # Firefox doesn't support masked icons yet.
                    if 'mask' not in image:
                        # It is not masked, so we want it. We are done here.
                        return (url, SVG_ICON_WIDTH)
                    logging.info(f'SVG icon "{image}" is masked')
                    continue
                with Image.open(BytesIO(response.content)) as img:
                    width, height = img.size
                    if width != height:
                        logging.info(f'icon shape "{width}*{height}" is not square')
                        width = min(width, height)
            except Exception as e:
                logging.info(f'Exception: "{str(e)}" fetching (or opening) icon {url}')
        if width and width > image_width:
            image_url = url
            image_width = width
    return (image_url, image_width)


def collect_icons_for_top_sites(topsitesfile, extrafile, count):
    results = []
    extra_domains = []
    if extrafile:
        # Add extra domains if an extra file is provided by the user.
        extra_domains = extra_sites(extrafile)
    for rank, hostname in top_sites(topsitesfile, count) + extra_domains:
        # Skip NSFW and excluded sites.
        if is_nsfw(hostname) or hostname in DOMAIN_EXCLUSION_LIST:
            continue
        url = 'https://{hostname}'.format(hostname=hostname)
        icons = fetch_icons(url)
        if len(icons) == 0 and 'www.' not in hostname:
            # Retry with www. in the hostname, as some domains require it explicitly.
            url = f"https://www.{hostname}"
            icons = fetch_icons(url)
        best_icon_url, best_icon_width = get_best_icon(icons)
        results.append({
            'hostname': hostname,
            'url': url,
            'icons': icons,
            'rank': rank,
            'best_icon_url': best_icon_url,
            'best_icon_width': best_icon_width,
        })
    logging.info('Done fetching icons')
    return results
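
# A quick sketch of the selection logic above, with hypothetical inputs:
#
#     icons = [{'href': '//example.com/small.png', 'sizes': '32x32'},
#              {'href': '//example.com/large.png', 'sizes': '192x192'}]
#
# get_best_icon(icons) returns ('https://example.com/large.png', 192) without
# fetching either file, because both widths parse from the 'sizes' attribute.
# An unmasked SVG candidate would instead win immediately with SVG_ICON_WIDTH.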

@click.command()
@click.option('--count', default=10,
              help='Number of sites from a list of Top Sites that should be used to '
                   'generate the manifest. Default is 10.')
@click.option('--topsitesfile', type=click.Path(exists=True),
              help='A csv file containing comma-separated rank and domain information '
                   '(in that order) for the Top Sites. If no file is provided then '
                   'Alexa Top Sites are used.')
@click.option('--extrafile', type=click.Path(exists=True),
              help='A csv file containing domain information for extra top sites. If '
                   'no file is provided then no extra Top Sites are added.')
@click.option('--minwidth', default=96,
              help='Minimum width of the site icon. Only sites that satisfy this '
                   'requirement are added to the manifest. Default is 96.')
@click.option('--loadrawsitedata', help='Load the full site data from the specified file.')
@click.option('--saverawsitedata', help='Save the full site data to the specified file.')
def make_manifest(count, minwidth, topsitesfile, extrafile, saverawsitedata, loadrawsitedata):
    results = []
    if loadrawsitedata:
        logging.info(f'Loading raw icon data from {loadrawsitedata}')
        with open(loadrawsitedata) as infile:
            sites_with_icons = json.loads(infile.read())
    else:
        sites_with_icons = collect_icons_for_top_sites(topsitesfile, extrafile, count)
    if saverawsitedata:
        logging.info(f'Saving raw icon data to {saverawsitedata}')
        with open(saverawsitedata, 'w') as outfile:
            json.dump(sites_with_icons, outfile, indent=4)

    for site in sites_with_icons:
        hostname = site.get('hostname')
        url = site.get('url')
        icon = site.get('best_icon_url')
        icon_width = site.get('best_icon_width')
        # Check whether there is a best icon that satisfies the minwidth criterion.
        if icon is None or (icon_width != SVG_ICON_WIDTH and icon_width < minwidth):
            logging.info(f'No icon for "{url}" (best icon width: {icon_width})')
            continue
        # Group domains that share the same icon URL under a single manifest entry.
        existing = next((x for x in results if x.get('image_url') == icon), None)
        if existing:
            existing.get('domains').append(hostname)
        else:
            results.append({
                'image_url': icon,
                'domains': [hostname],
            })

    # Sort alphabetically by each entry's first domain.
    results = sorted(results, key=lambda site: site['domains'][0])
    click.echo(json.dumps(results, indent=4))


if __name__ == '__main__':
    make_manifest()
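
# For reference, the emitted manifest is a JSON array that groups domains by
# icon URL; a hypothetical run might print:
#
#     [
#         {
#             "image_url": "https://example.com/apple-touch-icon.png",
#             "domains": ["example.com", "example.org"]
#         }
#     ]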