plugins/atlas-link-checker/atlas_link_checker/main.py

"""Check links in the rendered mkdocs site.

Walks every HTML file under MKDOCS_SITE_DIRECTORY and reports, per page:
bare links (no http/https scheme), links that point at the old site, and
links whose targets fail to respond.
"""

import argparse
import os.path
from argparse import Namespace
from collections import namedtuple
from glob import glob
from typing import List

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

from .config import EXCLUDED_LINKS, MKDOCS_SITE_DIRECTORY, OLD_SITE_PREFIX
from .logconfig import setup_logging

logger = setup_logging(__name__)

# Per-page classification of problem links.
LinkStatus = namedtuple('LinkStatus', ['old_site_links', 'bare_links', 'bad_links'])


def parse_args() -> Namespace:
    parser = argparse.ArgumentParser(description='check links in the rendered mkdocs site')
    parser.add_argument('--title', type=str, help='check links only on the page with a matching title')
    parser.add_argument('--fname', type=str, help='check links only in the file with a matching name')
    return parser.parse_args()


def html_files() -> List[str]:
    """Return every HTML file in the rendered site, failing fast if the site is missing."""
    if not os.path.isdir(MKDOCS_SITE_DIRECTORY):
        raise FileNotFoundError('mkdocs site directory not found')
    files = glob(f'{MKDOCS_SITE_DIRECTORY}/**/*.html', recursive=True)
    logger.info(f'found {len(files)} html files in mkdocs site directory')
    return files


def read_file(fname: str) -> str:
    with open(fname) as f:
        return f.read()


def skip_link(link: Tag) -> bool:
    """Return True for links the checker should ignore: anchors without an
    href, explicitly excluded URLs, mkdocs header permalinks, and theme
    furniture (classes prefixed with 'md-')."""
    if not link.has_attr('href'):
        return True
    if link['href'] in EXCLUDED_LINKS:
        return True
    if 'class' not in link.attrs:
        return False
    if 'headerlink' in link['class']:
        return True
    return any(c.startswith('md-') for c in link['class'])


def html_links(soup: BeautifulSoup) -> List[Tag]:
    """Collect the checkable links on a page."""
    return [link for link in soup.find_all('a') if not skip_link(link)]


def old_site_link(link: Tag) -> bool:
    return link['href'].startswith(OLD_SITE_PREFIX)


def bare_link(link: Tag) -> bool:
    """A bare link has no http(s) scheme, e.g. a relative or schemeless URL."""
    return not link['href'].startswith(('http://', 'https://'))


def bad_link(link: Tag) -> bool:
    """Return True when the link target is unreachable or answers with an
    error status; redirects are not followed, and the short timeout keeps
    a full-site scan fast."""
    try:
        r = requests.get(link['href'], allow_redirects=False, timeout=0.5)
    except requests.exceptions.RequestException:
        # Connection failures and timeouts both count as bad.
        return True
    return not r.ok


def check_links(soup: BeautifulSoup) -> LinkStatus:
    """Classify every checkable link on the page. A link may be both an
    old-site link and bare or bad; bare links are never fetched, so only
    links with a scheme can be reported as bad."""
    old_site_links: List[Tag] = []
    bare_links: List[Tag] = []
    bad_links: List[Tag] = []
    for link in html_links(soup):
        if old_site_link(link):
            old_site_links.append(link)
        if bare_link(link):
            bare_links.append(link)
        elif bad_link(link):
            bad_links.append(link)
    return LinkStatus(old_site_links, bare_links, bad_links)


def link_report(args: Namespace, fname: str) -> None:
    """Log a report for one page, honoring the --fname and --title filters."""
    if args.fname and fname != args.fname:
        return
    html = read_file(fname)
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title.contents[0]
    if args.title and title != args.title:
        return
    logger.info(f'==== {title}: {fname} ====')
    link_status = check_links(soup)
    if link_status.bare_links:
        logger.warning('BARE LINKS:')
        for link in link_status.bare_links:
            logger.warning(f'  {link}')
    if link_status.old_site_links:
        logger.warning('OLD SITE LINKS:')
        for link in link_status.old_site_links:
            logger.warning(f'  {link}')
    if link_status.bad_links:
        logger.error('BAD LINKS:')
        for link in link_status.bad_links:
            logger.error(f'  {link}')


def main():
    args = parse_args()
    if args.fname or args.title:
        logger.info(f'restricted to filename [{args.fname}] or title [{args.title}]')
    for fname in html_files():
        link_report(args, fname)


if __name__ == '__main__':
    main()
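
# Usage sketch (the page title below is hypothetical; assumes the package is
# importable and `mkdocs build` has already written the rendered site):
#
#   python -m atlas_link_checker.main                          # scan every page
#   python -m atlas_link_checker.main --title 'Atlas Home'     # one page by title
#   python -m atlas_link_checker.main --fname site/index.html  # one page by file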