in link-verifier/verify-links.py
def main():
    parser = argparse.ArgumentParser(
        description='A script to test HTTP links and all links in Markdown files.',
        epilog='Requires beautifulsoup4, requests, and termcolor from PyPI. ' +
               'Optional dependencies: pandoc (to support testing Markdown files) and gh (to speed up checking GitHub links).'
    )
parser.add_argument("-F", "--files", action="store", dest="files", nargs='+', help="List of Markdown files to test links in.")
parser.add_argument("-L", "--links", action="store", dest="links", nargs='+', help="List of links to test.")
parser.add_argument("-M", "--test-markdown", action="store_true", default=False, help="Enable search of Markdown files for testing links.")
parser.add_argument("-D", "--exclude-dirs", action="store", dest="exclude_dirs", nargs='+', help="List of directories to ignore.")
parser.add_argument("-I", "--include-file-types", action="store", dest="include_files", nargs='+', help="List of file patterns to search for URLs.")
parser.add_argument("-A", "--allowlist-file", action="store", dest="allowlist", help="Path to file containing list of allowed URLs.")
parser.add_argument("-n", "--num-processes", action="store", type=int, default=4, help="Number of processes to run in parallel")
parser.add_argument("-k", "--keep", action="store_true", default=False, help="Keep temporary files instead of deleting")
parser.add_argument("-v", "--verbose", action="store_true", default=False, help="Print all links tested")
args = parser.parse_args()
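    # Illustrative invocations (directory names and file paths are examples only):
    #   python verify-links.py -M -D .git node_modules -n 8
    #   python verify-links.py -I .c .md -A allowlist.txt
    #   python verify-links.py -L https://example.com/page -v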
html_file_list = []
broken_links = []
file_list = []
link_list = []
    exclude_dirs = [d.lower() for d in args.exclude_dirs] if args.exclude_dirs else []
# If any explicit files are passed, add them to file_list.
if args.files is not None:
file_list = args.files
elif args.test_markdown:
        # Collect Markdown files from the repository, skipping any excluded directories.
for root, dirs, files in os.walk("./"):
            # Prune excluded directories (if any were passed on the command line) from the walk.
            dirs[:] = [d for d in dirs if d.lower() not in exclude_dirs]
file_list += [os.path.join(root, f) for f in files if re.search(MARKDOWN_SEARCH_TERM, f, re.IGNORECASE)]
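            # Note: MARKDOWN_SEARCH_TERM is assumed to be a module-level regex,
            # e.g. r'\.md$'; re.IGNORECASE lets it match '.MD' files as well.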
if args.verbose:
print(file_list)
# If any explicit links are passed, add them to link_list.
if args.links is not None:
link_list = args.links
elif args.include_files is not None:
for root, dirs, files in os.walk("./"):
            # Prune excluded directories from the walk, if any were passed.
            dirs[:] = [d for d in dirs if d.lower() not in exclude_dirs]
for file in files:
if any(file.endswith(file_type) for file_type in args.include_files):
f_path = os.path.join(root, file)
print("Processing File: {}".format(f_path))
with open(f_path, 'r', encoding="utf8", errors='ignore') as f:
                        # errors='ignore' suppresses UnicodeDecodeError when the
                        # file contains invalid UTF-8 byte sequences.
text = f.read()
urls = re.findall(URL_SEARCH_TERM, text)
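                    # URL_SEARCH_TERM is assumed to use capture groups, so each
                    # findall() match is a tuple whose first element is the full URL.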
for url in urls:
if url[0] not in link_list:
link_list.append(url[0])
    # If an allowlist file was passed, seed link_cache with its links so the
    # HTTP check on those URLs is skipped.
if args.allowlist is not None:
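        # link_cache is assumed to be a module-level dict mapping URL ->
        # (is_broken, status), the same tuple shape test_url() returns; seeding
        # (False, 'Allowed') makes the checker treat these URLs as good.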
with open(args.allowlist, 'r') as file:
for link in file.read().strip('\n').split('\n'):
link_cache[link] = (False, 'Allowed')
try:
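        # create_html() is assumed to invoke pandoc and return a
        # CompletedProcess-style object, hence the returncode/stdout checks below.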
file_map = {}
for f in file_list:
process = create_html(f)
if process.returncode != 0:
cprint(process.stdout, 'red')
print('Did you install pandoc?')
sys.exit(process.returncode)
            html_name = html_name_from_markdown(f)
            html_file_list.append(html_name)
            # Record which Markdown file this HTML file was generated from.
            file_map[html_name] = f
# Parse files in parallel.
pool = Pool(args.num_processes)
file_objects = pool.map(parse_file, html_file_list)
pool.close()
pool.join()
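        # Pool is assumed to be multiprocessing.Pool: close() stops new task
        # submission and join() waits for every parse_file() worker to finish.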
for file_obj in file_objects:
consolidate_repo_list(file_obj.linked_repos)
# Test links in series so we don't send too many HTTP requests in a short interval.
for file_obj in file_objects:
file_obj.identify_broken_links(file_map, args.verbose)
broken_links += file_obj.broken_links
    # Remove the temporary HTML files we created, even if an exception occurred.
finally:
for f in html_file_list:
if not args.keep:
os.remove(f)
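    # Check the standalone links gathered above, one at a time; test_url() is
    # assumed to return an (is_broken, status) tuple and to honor link_cache.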
for link in link_list:
is_broken, status_code = test_url(link)
if is_broken:
broken_links.append(link)
cprint(f'{status_code}\t{link}', 'red')
        elif args.verbose:
            cprint(f'{status_code}\t{link}', 'green')
    # Exit with a non-zero status when any links are broken, so callers (e.g. CI) can fail on it.
    num_broken = len(broken_links)
    if num_broken > 0:
        print(f'{num_broken} broken link{"" if num_broken == 1 else "s"}')
    sys.exit(num_broken != 0)