def main()

in link-verifier/verify-links.py


def main():
    parser = argparse.ArgumentParser(
        description='A script to test HTTP links and all links in Markdown files.',
        epilog='Requires beautifulsoup4, requests, and termcolor from PyPI. ' +
               'Optional dependencies: pandoc (to support testing Markdown files), gh (to speed up checking GitHub links)'
    )
    parser.add_argument("-F", "--files", action="store", dest="files", nargs='+', help="List of Markdown files to test links in.")
    parser.add_argument("-L", "--links", action="store", dest="links", nargs='+', help="List of links to test.")
    parser.add_argument("-M", "--test-markdown", action="store_true", default=False, help="Enable search of Markdown files for testing links.")
    parser.add_argument("-D", "--exclude-dirs", action="store", dest="exclude_dirs", nargs='+', help="List of directories to ignore.")
    parser.add_argument("-I", "--include-file-types", action="store", dest="include_files", nargs='+', help="List of file patterns to search for URLs.")
    parser.add_argument("-A", "--allowlist-file", action="store", dest="allowlist", help="Path to file containing list of allowed URLs.")
    parser.add_argument("-n", "--num-processes", action="store", type=int, default=4, help="Number of processes to run in parallel")
    parser.add_argument("-k", "--keep", action="store_true", default=False, help="Keep temporary files instead of deleting")
    parser.add_argument("-v", "--verbose", action="store_true", default=False, help="Print all links tested")
    args = parser.parse_args()

    html_file_list = []
    broken_links = []
    file_list = []
    link_list = []
    exclude_dirs = [dir.lower() for dir in args.exclude_dirs] if args.exclude_dirs else []

    # If any explicit files are passed, add them to file_list.
    if args.files is not None:
        file_list = args.files
    elif args.test_markdown:
        # Obtain the list of Markdown files in the repository, excluding any passed directories.
        for root, dirs, files in os.walk("./"):
            # Prune excluded directories (if passed on the command line) from the walk.
            dirs[:] = [dir for dir in dirs if dir.lower() not in exclude_dirs]
            file_list += [os.path.join(root, f) for f in files if re.search(MARKDOWN_SEARCH_TERM, f, re.IGNORECASE)]

    if args.verbose:
        print(file_list)

    # If any explicit links are passed, add them to link_list.
    if args.links is not None:
        link_list = args.links
    elif args.include_files is not None:
        for root, dirs, files in os.walk("./"):
            # Prune excluded directories, if passed, from the walk.
            dirs[:] = [dir for dir in dirs if dir.lower() not in exclude_dirs]
            for file in files:
                if any(file.endswith(file_type) for file_type in args.include_files):
                    f_path = os.path.join(root, file)
                    print("Processing File: {}".format(f_path))
                    with open(f_path, 'r', encoding="utf8", errors='ignore') as f:
                        # errors='ignore' suppresses UnicodeDecodeError when
                        # reading invalid UTF-8 characters.
                        text = f.read()
                        urls = re.findall(URL_SEARCH_TERM, text)
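                        # URL_SEARCH_TERM contains capture groups, so findall yields a
                        # tuple per match; the first group (url[0]) is the link to record.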
                        for url in urls:
                            if url[0] not in link_list:
                                link_list.append(url[0])

    # If an allowlist file is passed, add its links to link_cache so the check on those URLs is skipped.
    if args.allowlist is not None:
        with open(args.allowlist, 'r') as file:
            for link in file.read().strip('\n').split('\n'):
                link_cache[link] = (False, 'Allowed')

    try:
        file_map = {}
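        # Convert each Markdown file to HTML (pandoc is required for this step) so its links can be extracted.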
        for f in file_list:
            process = create_html(f)
            if process.returncode != 0:
                cprint(process.stdout, 'red')
                print('Did you install pandoc?')
                sys.exit(process.returncode)
            html_file_list.append(html_name_from_markdown(f))
            # Map each generated HTML file back to the Markdown file it was created from.
            file_map[html_name_from_markdown(f)] = f

        # Parse files in parallel.
        pool = Pool(args.num_processes)
        file_objects = pool.map(parse_file, html_file_list)
        pool.close()
        pool.join()
        for file_obj in file_objects:
            consolidate_repo_list(file_obj.linked_repos)
        # Test links in series so we don't send too many HTTP requests in a short interval.
        for file_obj in file_objects:
            file_obj.identify_broken_links(file_map, args.verbose)
            broken_links += file_obj.broken_links
    # Remove the temporary HTML files we created, even if there was an exception.
    finally:
        for f in html_file_list:
            if not args.keep:
                os.remove(f)

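    # Test any URLs that were passed via --links or scraped from the included file types.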
    for link in link_list:
        is_broken, status_code = test_url(link)
        if is_broken:
            broken_links.append(link)
            cprint(f'{status_code}\t{link}', 'red')
        else:
            if args.verbose:
                cprint(f'{status_code}\t{link}', 'green')

    # Exit with a non-zero status code if any broken links were found.
    num_broken = len(broken_links)
    if num_broken > 0:
        print(f'{num_broken} broken link{"s" if num_broken != 1 else ""}')
    sys.exit(num_broken != 0)
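
# Example invocations (illustrative only; the directory and URL below are hypothetical,
# but the flags correspond to the parser defined above):
#   python verify-links.py --test-markdown --exclude-dirs build --verbose
#   python verify-links.py --links https://example.com/docs --num-processes 8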