# packages/python-packages/doc-warden/warden/enforce_readme_content.py
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import print_function
import os
import markdown2
import bs4
import re
from .warden_common import walk_directory_for_pattern, get_omitted_files
from .HeaderConstruct import HeaderConstruct
from docutils import core
from docutils.writers.html4css1 import Writer, HTMLTranslator

README_PATTERNS = ['*/readme.md', '*/readme.rst', '*/README.md', '*/README.rst']
CODE_FENCE_REGEX = r"```([\s\S]*?)```"
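# Example: the regex above matches fenced blocks such as
#   ```python
#   print("hello world")
#   ```
# fences included, so they can be stripped wholesale before markdown conversion
# (see verify_md_readme below)
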
# entry point
def verify_readme_content(config):
    all_readmes = walk_directory_for_pattern(config.target_directory, README_PATTERNS, config)
    omitted_readmes = get_omitted_files(config)
    targeted_readmes = [readme for readme in all_readmes if readme not in omitted_readmes]
    known_issue_paths = config.get_known_content_issues()
    section_sorting_dict = config.required_readme_sections
    ignored_missing_readme_paths = []
    readme_results = []
    readmes_with_issues = []

    for readme in targeted_readmes:
        ext = os.path.splitext(readme)[1]
        if ext == '.rst':
            readme_results.append(verify_rst_readme(readme, config, section_sorting_dict))
        else:
            readme_results.append(verify_md_readme(readme, config, section_sorting_dict))

    for readme_path, missed_sections in readme_results:
        if missed_sections:
            if readme_path in known_issue_paths:
                ignored_missing_readme_paths.append((readme_path, missed_sections))
            else:
                readmes_with_issues.append((readme_path, missed_sections))

    return readmes_with_issues, ignored_missing_readme_paths

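# Minimal usage sketch (assumes a `config` object exposing target_directory,
# required_readme_sections, verbose_output, and get_known_content_issues(),
# as the functions here do):
#   issues, known_issues = verify_readme_content(config)
#   for path, missing_sections in issues:
#       print('{} is missing sections matching {}'.format(path, missing_sections))
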
# parse rst to html, check for presence of appropriate sections
def verify_rst_readme(readme, config, section_sorting_dict):
    if config.verbose_output:
        print('Examining content in {}'.format(readme))
    with open(readme, 'r', encoding="utf-8") as f:
        readme_content = f.read()
    html_readme_content = rst_to_html(readme_content)
    html_soup = bs4.BeautifulSoup(html_readme_content, "html.parser")
    missed_patterns = find_missed_sections(html_soup, config.required_readme_sections)
    return (readme, missed_patterns)

# parse md to html, check for presence of appropriate sections
def verify_md_readme(readme, config, section_sorting_dict):
    if config.verbose_output:
        print('Examining content in {}'.format(readme))
    with open(readme, 'r', encoding="utf-8-sig") as f:
        readme_content = f.read()
    # we need to sanitize away the fenced code blocks before conversion, as markdown2
    # has trouble parsing the pygments-style fences that we use with GitHub
    sanitized_readme_content = re.sub(CODE_FENCE_REGEX, "", readme_content, flags=re.MULTILINE)
    html_readme_content = markdown2.markdown(sanitized_readme_content)
    html_soup = bs4.BeautifulSoup(html_readme_content, "html.parser")
    missed_patterns = find_missed_sections(html_soup, config.required_readme_sections)
    return (readme, missed_patterns)

# within the entire readme, are any of the expected sections missing?
def find_missed_sections(html_soup, patterns):
    header_list = html_soup.find_all(re.compile('^h[1-4]$'))
    flattened_patterns = flatten_pattern_config(patterns)
    header_index = generate_header_index(header_list, flattened_patterns)
    observed_failing_patterns = recursive_header_search(header_index, patterns, [])
    return observed_failing_patterns

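# Illustrative example (the config values here are hypothetical): given
#   patterns = ['^Getting started$', {'^Key concepts$': ['^Examples$']}]
# a readme with 'Getting started' and 'Key concepts' headers but no 'Examples'
# header nested beneath 'Key concepts' would return
#   [['^Key concepts$', '^Examples$']]
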
# gets the distinct set of ALL patterns present in a config. This matters because
# it lets us precalculate which patterns a given header tag will match
def flatten_pattern_config(patterns):
    observed_patterns = []
    for pattern in patterns:
        if isinstance(pattern, dict):
            parent_pattern, child_patterns = next(iter(pattern.items()))
            if child_patterns:
                observed_patterns.extend(flatten_pattern_config(child_patterns))
            observed_patterns.append(parent_pattern)
        else:
            observed_patterns.append(pattern)
    return list(set(observed_patterns))

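# Illustrative example: a nested config such as
#   [{'^Getting started$': ['^Install the package$']}, '^Key concepts$']
# flattens to the distinct set
#   ['^Getting started$', '^Install the package$', '^Key concepts$']
# (in no guaranteed order, since a set handles the de-duplication)
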
# recursive solution that walks all the rules, generating rule chains from them to test
# that the tree actually contains sets of headers meeting the required sections
def recursive_header_search(header_index, patterns, parent_pattern_chain=None):
    # default to a fresh list rather than a shared mutable default argument
    if parent_pattern_chain is None:
        parent_pattern_chain = []
    unobserved_patterns = []
    if patterns:
        for pattern in patterns:
            if isinstance(pattern, dict):
                parent_pattern, child_patterns = next(iter(pattern.items()))
                if not match_regex_to_headers(header_index, parent_pattern_chain + [parent_pattern]):
                    unobserved_patterns.append(parent_pattern_chain + [parent_pattern])
                parent_chain_for_children = parent_pattern_chain + [parent_pattern]
                unobserved_patterns.extend(recursive_header_search(header_index, child_patterns, parent_chain_for_children))
            else:
                if not match_regex_to_headers(header_index, parent_pattern_chain + [pattern]):
                    unobserved_patterns.append(parent_pattern_chain + [pattern])
    return unobserved_patterns

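# Illustrative example: for the config
#   [{'^Getting started$': ['^Install the package$']}]
# the chains generated and tested against the header index are
#   ['^Getting started$'] and ['^Getting started$', '^Install the package$']
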
# a set of headers looks like this
#   h1
#     h2
#   h1
#     h2
#       h3
#   h1
# any "indented" headers are children of the one above it IF the
# one above it is at a higher header level (numerically this is actually a < comparison)
# the result of the above should be a tree that looks like
#   root
#     h1
#       h2
#     h1
#       h2
#         h3
#     h1
# this function examines a serial set of <h> tags and generates
# an index that allows us to interrogate a specific header for its containing
# headers.
def generate_header_index(header_tags, patterns):
    current_header = None
    root = HeaderConstruct(None, None)
    current_parent = root
    header_index = []
    previous_node_level = 0
    for index, header in enumerate(header_tags):
        # evaluate the level from the tag name, e.g. 'h2' -> 2
        current_level = int(header.name.replace('h', ''))
        # an h1 arriving under an h2 parent == we need to traverse up
        if current_level < current_parent.level:
            current_parent = current_parent.get_parent_by_level(current_level)
            current_header = HeaderConstruct(header, current_parent, patterns)
        # an h2 arriving under an h1 parent == we need to indent, add the current as a child,
        # and set parent to current for the forthcoming headers
        elif current_level > current_parent.level:
            current_header = HeaderConstruct(header, current_parent, patterns)
            # only set current_parent if there are children below, which NECESSITATES that
            # the very next header must A) exist and B) be > current_level
            if index + 1 < len(header_tags):
                if int(header_tags[index + 1].name.replace('h', '')) > current_level:
                    current_parent = current_header
        # current_level == current_parent.level
        # traverse up if we just left a deeper nesting, then add the header
        # under the (possibly updated) current parent
        else:
            if previous_node_level > current_parent.level:
                current_parent = current_parent.get_parent_by_level(current_level)
            current_header = HeaderConstruct(header, current_parent, patterns)
        previous_node_level = current_level
        # always add the header to the node index, we will use it later
        header_index.append(current_header)
    return header_index

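# Illustrative example: for the header sequence h1, h2, h1, h2, h3, h1 diagrammed
# above, header_index preserves document order as a flat list of HeaderConstruct
# nodes, each of which knows its parent (the second h2 parents the h3, and so on)
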
# checks the node index for a specific pattern or chain
# [^Getting started$, Install Package] is an example of a required set
def match_regex_to_headers(header_index, target_patterns):
    # we should only be firing this for a "leaf" aka the END of the chain we're looking for,
    # so the last element always gets popped first before we recurse across the rest
    current_target = target_patterns.pop()
    matching_headers = [header for header in header_index if current_target in header.matching_patterns]
    # check the parents of every matching leaf for the rest of the chain. we don't want to
    # artificially constrain the search, so we assume a rule can match multiple headers
    for matching_leaf_header in matching_headers:
        if not target_patterns:
            # a single-element chain is satisfied by the leaf match alone; membership in
            # matching_patterns already proves the regex matched this header's text
            return matching_leaf_header
        if check_header_parents(matching_leaf_header, target_patterns[:]):
            return matching_leaf_header
    return None

# recursively ensure that a header_construct has parents that match the required headers
# the search ALLOWS GAPS, so a match will still be found if
#
#   h1
#     h2 (matching header)
#       h3 (unmatched parent header, but this is ok)
#         h4 (matching header)
def check_header_parents(header_construct, required_parent_headers):
    if required_parent_headers:
        target_parent = required_parent_headers.pop()
        new_parent = header_construct.check_parents_for_pattern(target_parent)
        if new_parent:
            if required_parent_headers:
                # continue the search upward FROM the matched parent so the chain must
                # appear in order, and propagate the recursive result to the caller
                return check_header_parents(new_parent, required_parent_headers)
            return True
        return False
    return False

# checks a header string against a set of configured patterns, collecting every
# pattern that matches (a single header can satisfy more than one rule)
def match_regex_set(header, patterns):
    matching_patterns = []
    for pattern in patterns:
        if re.search(pattern, header):
            matching_patterns.append(pattern)
    return matching_patterns

# boilerplate for translating RST
class HTMLFragmentTranslator(HTMLTranslator):
    def __init__(self, document):
        HTMLTranslator.__init__(self, document)
        self.head_prefix = ['', '', '', '', '']
        self.body_prefix = []
        self.body_suffix = []
        self.stylesheet = []

    def astext(self):
        return ''.join(self.body)

html_fragment_writer = Writer()
html_fragment_writer.translator_class = HTMLFragmentTranslator

# utilize the boilerplate above. note that publish_string returns utf-8-encoded
# bytes by default, which BeautifulSoup accepts directly
def rst_to_html(input_rst):
    return core.publish_string(input_rst, writer=html_fragment_writer)
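
# Minimal usage sketch: convert an rst fragment, then scan it for headers the
# same way find_missed_sections does
#   html = rst_to_html(readme_content)
#   soup = bs4.BeautifulSoup(html, 'html.parser')
#   headers = soup.find_all(re.compile('^h[1-4]$'))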