# plugins/atlas-link-checker/atlas_link_checker/main.py
import argparse
import os.path
from collections import namedtuple
from glob import glob
from typing import List
from argparse import Namespace
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from .config import EXCLUDED_LINKS, MKDOCS_SITE_DIRECTORY, OLD_SITE_PREFIX
from .logconfig import setup_logging
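
# The imported config constants are project-specific. Representative values
# (hypothetical examples, not the project's real settings) might look like:
#   EXCLUDED_LINKS = {'#', 'index.html'}         # hrefs the checker never tests
#   MKDOCS_SITE_DIRECTORY = 'site'               # where `mkdocs build` writes HTML
#   OLD_SITE_PREFIX = 'https://old.example.com'  # prefix of pre-migration URLs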
logger = setup_logging(__name__)
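
# Per-page buckets: links that still point at the old site, links that are not
# absolute http(s) URLs, and links whose target cannot be fetched successfully.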
LinkStatus = namedtuple('LinkStatus', ['old_site_links', 'bare_links', 'bad_links'])

def parse_args() -> Namespace:
    """Parse the command-line options for the link checker."""
    parser = argparse.ArgumentParser(description='check links in the generated mkdocs site')
    parser.add_argument('--title', type=str, help='only check links on the page with this title')
    parser.add_argument('--fname', type=str, help='only check links in the file with this name')
    return parser.parse_args()

def html_files() -> List[str]:
    """Return every HTML file under the generated mkdocs site directory."""
    if not os.path.isdir(MKDOCS_SITE_DIRECTORY):
        raise FileNotFoundError(f'mkdocs site directory {MKDOCS_SITE_DIRECTORY!r} not found')
files = glob(f'{MKDOCS_SITE_DIRECTORY}/**/*.html', recursive=True)
logger.info(f'found {len(files)} html files in mkdocs site directory')
return files

def read_file(fname: str) -> str:
    """Read a file and return its contents as a string."""
    with open(fname, encoding='utf-8') as f:
        return f.read()

def skip_link(link: Tag) -> bool:
    """Return True for links the checker should not test."""
    href = link.get('href')
    if href is None or href in EXCLUDED_LINKS:
        return True
    if 'class' not in link.attrs:
        return False
    if 'headerlink' in link['class']:
        return True
    # Skip theme-generated links, identified by class names starting with 'md-'.
    return any(c.startswith('md-') for c in link['class'])

def html_links(soup: BeautifulSoup) -> List[Tag]:
    return [link for link in soup.find_all('a') if not skip_link(link)]

def old_site_link(link: Tag) -> bool:
    return link['href'].startswith(OLD_SITE_PREFIX)

def bare_link(link: Tag) -> bool:
    return not link['href'].startswith(('http://', 'https://'))

def bad_link(link: Tag) -> bool:
    """Return True if fetching the link fails or yields an error status."""
    try:
        r = requests.get(link['href'], allow_redirects=False, timeout=0.5)
    except requests.exceptions.RequestException:
        # Connection failures, timeouts, and malformed URLs all count as bad.
        return True
    return not r.ok

def check_links(soup: BeautifulSoup) -> LinkStatus:
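    """Sort a page's checkable links into the three LinkStatus buckets."""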
links = html_links(soup)
old_site_links: List[Tag] = []
bare_links: List[Tag] = []
bad_links: List[Tag] = []
for link in links:
if old_site_link(link):
old_site_links.append(link)
if bare_link(link):
bare_links.append(link)
elif bad_link(link):
bad_links.append(link)
return LinkStatus(old_site_links, bare_links, bad_links)

def link_report(args: Namespace, fname: str) -> None:
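    """Log a link report for one HTML file, honouring any --fname/--title filter."""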
if args.fname and fname != args.fname:
return
html = read_file(fname)
soup = BeautifulSoup(html, 'html.parser')
    title = soup.title.string if soup.title else fname
if args.title and title != args.title:
return
logger.info(f'==== {title}: {fname} ====')
link_status = check_links(soup)
if len(link_status.bare_links) > 0:
logger.warning('BARE LINKS:')
for link in link_status.bare_links:
logger.warning(f' {link}')
    if len(link_status.old_site_links) > 0:
        logger.warning('OLD SITE LINKS:')
        for link in link_status.old_site_links:
            logger.warning(f'  {link}')
if len(link_status.bad_links) > 0:
logger.error('BAD LINKS:')
for link in link_status.bad_links:
logger.error(f' {link}')

def main():
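    """Run the link checker over every page in the generated site."""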
args = parse_args()
if args.fname or args.title:
logger.info(f'restricted to filename [{args.fname}] or title [{args.title}]')
for fname in html_files():
link_report(args, fname)

if __name__ == '__main__':
main()
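
# Example invocations (module path taken from this file's location; hypothetical):
#   python -m atlas_link_checker.main
#   python -m atlas_link_checker.main --title 'My Page'
#   python -m atlas_link_checker.main --fname site/index.html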