in link-verifier/verify-links.py [0:0]
def __init__(self, html_file_name):
    """Parse html in file and extract links and ids"""
    self.ids = []
    self.internal_links = []
    self.external_links = []
    self.name = html_file_name
    self.abspath = os.path.abspath(html_file_name)
    self.broken_links = []
    self.linked_repos = {}
    with open(html_file_name, 'r') as infile:
        html_data = infile.read()
    dirname = os.path.dirname(self.name)
    soup = BeautifulSoup(html_data, 'html.parser')
    # Find IDs. This is to check internal links within a file.
    for tag in soup.find_all(True, {'id': True}):
        self.ids.append(tag.get('id'))
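    # Pre-compile the GitHub pull request and issue URL patterns used below.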
    pr_search = re.compile(PULL_REQUEST_SEARCH)
    issue_search = re.compile(ISSUE_SEARCH)
    for tag in soup.find_all('a'):
        link = tag.get('href')
        if link is None:
            # Anchors without an href carry no link to check, and passing
            # None to re.search() would raise a TypeError.
            continue
        if not re.search(HTTP_URL_SEARCH_TERM, link, re.IGNORECASE):
            # Non-http(s) target: treat it as an internal link unless its
            # scheme (e.g. mailto:) is explicitly ignored.
            if not re.search(IGNORED_LINK_SCHEMES, link, re.IGNORECASE):
                if link not in self.internal_links:
                    self.internal_links.append(link)
        else:
            if link not in self.external_links:
                self.external_links.append(link)
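            # Tally links to GitHub pull requests and issues per repository.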
            pr_match = pr_search.search(link)
            if pr_match:
                self.increment_gh_link_count(pr_match.group(1), pr_match.group(2), pr_match.group(3), True)
            else:
                issue_match = issue_search.search(link)
                if issue_match:
                    self.increment_gh_link_count(issue_match.group(1), issue_match.group(2), issue_match.group(3), False)
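The constants HTTP_URL_SEARCH_TERM, IGNORED_LINK_SCHEMES, PULL_REQUEST_SEARCH, and ISSUE_SEARCH, the increment_gh_link_count helper, and the os/re/BeautifulSoup imports all live elsewhere in verify-links.py and are not part of this excerpt. A minimal sketch of the shape the PR/issue patterns must have for the constructor to work (three capture groups yielding owner, repo, and number); these stand-in patterns are assumptions, not the file's actual definitions:

import re

# Hypothetical stand-ins; the actual constants are defined elsewhere
# in verify-links.py and may differ.
HTTP_URL_SEARCH_TERM = r'^https?://'
IGNORED_LINK_SCHEMES = r'^(?:mailto|tel|ftp):'
# Groups 1-3 must produce (owner, repo, number) for increment_gh_link_count.
PULL_REQUEST_SEARCH = r'github\.com/([\w.-]+)/([\w.-]+)/pull/(\d+)'
ISSUE_SEARCH = r'github\.com/([\w.-]+)/([\w.-]+)/issues/(\d+)'

match = re.search(PULL_REQUEST_SEARCH,
                  'https://github.com/octocat/hello-world/pull/42')
print(match.group(1), match.group(2), match.group(3))  # octocat hello-world 42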