packages/python-packages/doc-warden/warden/warden_common.py (207 lines of code) (raw):
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import fnmatch
import re
import xml.etree.ElementTree as ET
import pathlib2
# python 3 transitioned StringIO to be part of `io` module.
# python 2 needs the old version however
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
# fnmatch-style patterns identifying the file that marks a package for each language.
# each constant is passed to get_file_sets() as the target pattern by the corresponding
# get_*_package* function in this module, where it is matched against full file paths.
JS_PACKAGE_DISCOVERY_PATTERN = "*/package.json"
PYTHON_PACKAGE_DISCOVERY_PATTERN = "*/setup.py"
NET_PACKAGE_DISCOVERY_PATTERN = "*.csproj"
JAVA_PACKAGE_DISCOVERY_PATTERN = "*/pom.xml"
SWIFT_PACKAGE_DISCOVERY_PATTERN = "*/project.pbxproj"
# we want to walk the files as few times as possible. as such, for omitted_files, we provide a SET
# of patterns that we want to omit. This function simply checks whether a file path matches any of them.
# enabling directory mode activates slightly different logic: an additional match is attempted
# based on just the directory that contains the file.
def check_match(file_path, normalized_target_patterns, directory_mode=False):
    """Report whether ``file_path`` is matched by any of the given patterns.

    A pattern matches when ``fnmatch`` accepts the full path against it. When
    ``directory_mode`` is truthy, a pattern also matches if it is exactly equal
    to the directory portion of ``file_path``.

    Returns ``True`` if any pattern matches and ``False`` otherwise (including
    when no patterns are supplied).
    """
    for candidate_pattern in normalized_target_patterns:
        if fnmatch.fnmatch(file_path, candidate_pattern):
            return True
        # directory mode: the pattern may name the containing folder outright
        if directory_mode and os.path.dirname(file_path) == candidate_pattern:
            return True
    return False
def get_java_package_roots(configuration):
    """Discover Java package pom.xml files for the given configuration.

    Delegates to ``get_file_sets`` with the Java discovery pattern and
    ``is_java_pom_package_pom`` as the extra per-file check. The result of that
    call (the expected locations and the explicitly omitted locations) is
    printed when ``configuration.verbose_output`` is set, and returned.
    """
    java_file_sets = get_file_sets(
        configuration,
        JAVA_PACKAGE_DISCOVERY_PATTERN,
        lambda_check=is_java_pom_package_pom,
    )

    if configuration.verbose_output:
        print(java_file_sets)

    return java_file_sets
def get_net_package(configuration):
    """Discover .NET project files (``*.csproj``) for the given configuration.

    Delegates to ``get_file_sets`` with the .NET discovery pattern and
    ``is_net_csproj_package`` as the extra per-file check. The result of that
    call (the expected locations and the explicitly omitted locations) is
    printed when ``configuration.verbose_output`` is set, and returned.
    """
    net_file_sets = get_file_sets(
        configuration,
        NET_PACKAGE_DISCOVERY_PATTERN,
        lambda_check=is_net_csproj_package,
    )

    if configuration.verbose_output:
        print(net_file_sets)

    return net_file_sets
def get_project_roots_from_pbxproj_paths(pbxproj_file_set):
    """Map each pbxproj file path to its project root, returned as strings.

    The root is taken to be ``parents[1]`` of the path, i.e. the parent of the
    directory that contains the pbxproj file (presumably the folder holding
    the ``.xcodeproj`` bundle -- confirm against the repository layout).
    """
    return [
        str(pathlib2.Path(pbxproj_path).parents[1])
        for pbxproj_path in pbxproj_file_set
    ]
def get_swift_package_roots(configuration):
    """Discover Swift project roots for the given configuration.

    Finds ``project.pbxproj`` files through ``get_file_sets`` and converts
    both the expected and the omitted file lists into project root
    directories. The resulting ``(project_roots, omitted_project_roots)``
    tuple is printed when ``configuration.verbose_output`` is set, and
    returned.
    """
    found_pbxproj, omitted_pbxproj = get_file_sets(
        configuration, SWIFT_PACKAGE_DISCOVERY_PATTERN
    )

    swift_roots = (
        get_project_roots_from_pbxproj_paths(found_pbxproj),
        get_project_roots_from_pbxproj_paths(omitted_pbxproj),
    )

    if configuration.verbose_output:
        print(swift_roots)

    return swift_roots
def get_python_package_roots(configuration):
    """Discover Python package ``setup.py`` files for the given configuration.

    Delegates to ``get_file_sets`` with the Python discovery pattern and no
    extra per-file check. The result of that call (the expected locations and
    the explicitly omitted locations) is printed when
    ``configuration.verbose_output`` is set, and returned.
    """
    python_file_sets = get_file_sets(
        configuration, PYTHON_PACKAGE_DISCOVERY_PATTERN
    )

    if configuration.verbose_output:
        print(python_file_sets)

    return python_file_sets
def get_js_package_roots(configuration):
    """Discover JavaScript ``package.json`` files for the given configuration.

    Delegates to ``get_file_sets`` with the JS discovery pattern and no extra
    per-file check. The result of that call (the expected locations and the
    explicitly omitted locations) is printed when
    ``configuration.verbose_output`` is set, and returned.
    """
    js_file_sets = get_file_sets(configuration, JS_PACKAGE_DISCOVERY_PATTERN)

    if configuration.verbose_output:
        print(js_file_sets)

    return js_file_sets
# returns the two sets:
# the set of files where we expect a target_file to be present
# and the set of files that we expect a target_file to be present that have been explicitly omitted
def get_file_sets(configuration, target_pattern, lambda_check=None):
    """Split discovered files into "still expected" and "explicitly omitted".

    Walks ``configuration.target_directory`` for files matching
    ``target_pattern`` (and passing ``lambda_check`` when one is given), then
    compares them with the files omitted by the configuration.

    Returns a 2-tuple of lists:
      * discovered files that are not omitted
      * discovered files that are omitted
    Both lists are built from sets, so duplicates are removed and no ordering
    is guaranteed.
    """
    expected_locations = walk_directory_for_pattern(
        configuration.target_directory,
        [target_pattern],
        configuration,
        lambda_check,
    )
    omitted_lookup = set(get_omitted_files(configuration))

    still_expected = set(expected_locations) - omitted_lookup
    explicitly_omitted = omitted_lookup.intersection(expected_locations)

    return (list(still_expected), list(explicitly_omitted))
# gets the set of files in the target directory that have explicitly been omitted in the config settings
def get_omitted_files(configuration):
    """List the files under ``configuration.repo_root`` that the config omits.

    Each entry of ``configuration.omitted_paths`` (treated as empty when
    falsy) is turned into a pattern: entries starting with ``*`` are used
    verbatim, all others are joined onto the repo root. The repo is then
    walked in directory mode, so a pattern may name either files or a
    containing folder.
    """
    repo_root = configuration.repo_root

    omit_patterns = []
    for pattern in configuration.omitted_paths or []:
        if pattern.startswith("*"):
            # leading wildcard: already location independent, keep untouched
            omit_patterns.append(pattern)
        else:
            omit_patterns.append(os.path.join(repo_root, pattern))

    return list(
        walk_directory_for_pattern(
            repo_root,
            omit_patterns,
            configuration,
            lambda_check=None,
            directory_mode=True,
        )
    )
# convention. omit test projects
def is_net_csproj_package(file_path):
    """Return ``True`` when ``file_path`` looks like a shippable .NET project.

    A project is rejected (``False``) when, ignoring case, either:
      * its path contains a ``tests`` or ``samples`` directory segment
        (delimited by ``/`` or ``\\`` on both sides), or
      * its file name ends in ``test.csproj`` / ``tests.csproj``.

    The previous pattern was written in a non-raw string, so the separator
    group of its second alternative compiled to the literal text ``|/`` and
    could never match a real path. A raw string with an explicit character
    class expresses the convention unambiguously and avoids the invalid
    ``\\/`` string escape.
    """
    test_proj_exclude = re.compile(
        r".*[\\/](tests|samples)[\\/].*|.*tests?\.csproj$", re.IGNORECASE
    )

    return test_proj_exclude.match(file_path) is None
# Returns a list of files under a target directory. The files included will match any of the
# target_patterns AND the lambda_check function.
def walk_directory_for_pattern(
    target_directory,
    target_patterns,
    configuration,
    lambda_check=None,
    directory_mode=False,
):
    """Collect file paths under ``target_directory`` that match the patterns.

    Both the directory and every pattern are normalized with
    ``os.path.normpath`` before walking. A file is included when
    ``check_match`` accepts its joined path (honouring ``directory_mode``)
    and ``lambda_check`` -- when one is supplied -- returns a truthy value for
    it. With ``configuration.verbose_output`` set, every pattern match is
    announced before the check function runs.

    Returns the list of accepted paths in walk order.
    """
    search_root = os.path.normpath(target_directory)
    patterns = [os.path.normpath(raw_pattern) for raw_pattern in target_patterns]

    def accept_everything(_path):
        return True

    passes_check = lambda_check or accept_everything

    matched_paths = []
    for folder, _subfolders, file_names in os.walk(search_root):
        for file_name in file_names:
            candidate = os.path.join(folder, file_name)
            if not check_match(candidate, patterns, directory_mode):
                continue

            if configuration.verbose_output:
                print(
                    "Pattern matched {}. Running Check Function.".format(candidate)
                )

            if passes_check(candidate):
                matched_paths.append(candidate)

    return matched_paths
# given a file location or folder, check within or alongside for a target file
# case insensitive
def find_alongside_file(file_location, target):
    """Look for a file named ``target`` inside or next to ``file_location``.

    If ``file_location`` is a directory its own entries are searched;
    otherwise the entries of the directory containing it are searched. The
    name comparison is an exact, case-insensitive match (no globbing).

    Returns the normalized path of the first matching entry, or ``False`` when
    ``file_location`` does not exist, ``target`` is empty, or nothing matches.
    """
    if not os.path.exists(file_location) or not target:
        return False

    if os.path.isdir(file_location):
        # already a folder: look directly inside it
        containing_folder = file_location
    else:
        # a bare file name has an empty dirname, which os.listdir rejects;
        # fall back to the current directory in that case
        containing_folder = os.path.dirname(file_location) or os.curdir

    wanted_name = target.lower()
    for entry in os.listdir(containing_folder):
        if entry.lower() == wanted_name:
            return os.path.normpath(os.path.join(containing_folder, entry))

    return False
# finds the first file that matches a glob pattern under a target file's location
# case insensitive
def find_below_file(glob_pattern, file):
    """Find the first file at or below ``file``'s directory matching a glob.

    ``glob_pattern`` is translated with ``fnmatch`` and matched,
    case-insensitively, against bare file names while walking the directory
    that contains ``file``.

    Returns the joined path of the first match, or ``None`` when ``file`` does
    not exist, is a directory, ``glob_pattern`` is empty, or nothing matches.
    """
    if not os.path.exists(file) or not glob_pattern or os.path.isdir(file):
        return None

    name_rule = re.compile(fnmatch.translate(glob_pattern), re.IGNORECASE)
    search_root = os.path.dirname(file)

    for folder, _subfolders, file_names in os.walk(search_root):
        for file_name in file_names:
            if name_rule.match(file_name):
                return os.path.join(folder, file_name)

    return None
# searches upwards from a specified file for a pattern
# glob_pattern is the pattern we're matching against. often just a filename
# file is the file we're starting from
# path_exclusion_list is the list of paths we should hard stop traversing up on if we haven't already exited
# early_exit_lambda_check is a specific check that isn't only based on file. for .net we check to see if a .sln is present in the directory
def find_above_file(
    glob_pattern, file, path_exclusion_list, early_exit_lambda_check, root_directory
):
    """Search ``file``'s directory, then its ancestors, for a matching name.

    ``glob_pattern`` is translated with ``fnmatch`` and matched,
    case-insensitively, against the entries of each visited directory.

    Traversal stops (returning ``None``) when:
      * the current directory is in ``path_exclusion_list`` or equals
        ``root_directory`` (as judged by
        ``check_folder_against_exclusion_list``),
      * ``early_exit_lambda_check(directory)`` is truthy after that directory
        was scanned without a match, or
      * the filesystem root has been scanned without a match.

    Returns the normalized path of the first match, or ``None``. Also returns
    ``None`` up front when ``file`` does not exist, is a directory, or
    ``glob_pattern`` is empty.

    Raises ``SystemExit(1)`` when neither an exclusion list nor an early-exit
    check is supplied.
    """
    if not os.path.exists(file) or not glob_pattern or os.path.isdir(file):
        return None

    if not path_exclusion_list and early_exit_lambda_check is None:
        print(
            "Using find_above_file without at least one member set for package_indexing_traversal_stops in .docsettings OR setting an early_exit_lambda_check is disallowed. Exiting."
        )
        # raise directly instead of calling the site-provided exit() helper,
        # which is not guaranteed to exist in every interpreter configuration
        raise SystemExit(1)

    # the guard above allows a missing/empty exclusion list as long as an
    # early-exit check is supplied, so None must be tolerated here
    complete_exclusion_list = list(path_exclusion_list or []) + [root_directory]

    if early_exit_lambda_check is None:
        # NOTE(review): this default makes the search stop after the starting
        # directory whenever no check is supplied -- confirm that is intended
        early_exit_lambda_check = lambda path: True

    target_rule = re.compile(fnmatch.translate(glob_pattern), re.IGNORECASE)
    file_dir = os.path.dirname(file)

    while not check_folder_against_exclusion_list(file_dir, complete_exclusion_list):
        for entry in os.listdir(file_dir):
            if target_rule.match(entry):
                return os.path.normpath(os.path.join(file_dir, entry))

        # the early_exit_lambda check runs after we're done scanning the current directory for matches
        if early_exit_lambda_check(file_dir):
            return None

        parent_dir = os.path.abspath(os.path.join(file_dir, "../"))
        if parent_dir == file_dir:
            # the filesystem root is its own parent; without this stop the
            # loop would spin forever when no exclusion entry ever matches
            return None
        file_dir = parent_dir

    return None
# True if folder matches anything in the exclusion list
# False if not
def check_folder_against_exclusion_list(folder, path_exclusion_list):
    """Decide whether traversal must stop at ``folder``.

    Returns ``True`` when ``folder`` is not an existing directory, or when its
    normalized path is a member of ``path_exclusion_list``; ``False``
    otherwise. List entries are compared as given (they are not normalized
    here).
    """
    is_existing_folder = os.path.isdir(folder)
    return (
        not is_existing_folder or os.path.normpath(folder) in path_exclusion_list
    )
# given a pom.xml, crack it open and ensure that it is actually a package pom (versus a parent pom)
def is_java_pom_package_pom(file_path):
    """Tell whether the pom.xml at ``file_path`` is a package pom.

    The pom is parsed with ``parse_pom`` and its ``parent/artifactId`` element
    is inspected: the pom counts as a package pom only when that element
    exists and its text is ``azure-client-sdk-parent`` or
    ``azure-data-sdk-parent``.
    """
    parent_artifact = parse_pom(file_path).find("parent/artifactId")

    if parent_artifact is None:
        return False

    return parent_artifact.text in (
        "azure-client-sdk-parent",
        "azure-data-sdk-parent",
    )
# namespaces in xml really mess with xmlTree: https://bugs.python.org/issue18304
# this function provides a workaround for both parsing an xml file as well as REMOVING said namespaces
def parse_pom(file_path):
    """Parse the XML file at ``file_path`` and return its root element.

    The file is read as UTF-8 and parsed incrementally; every element tag has
    its ``{namespace}`` prefix removed so that plain paths such as
    ``parent/artifactId`` can be used with ``find``.

    Any failure -- reading the file or parsing its XML -- prints
    ``Invalid XML in <file_path>`` and re-raises the original exception.
    Previously only read errors produced that message, while genuinely
    malformed XML raised without it.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            xml_text = f.read()

        it = ET.iterparse(StringIO(xml_text))
        for _, el in it:
            if "}" in el.tag:
                # '{namespace}tag' -> 'tag'
                el.tag = el.tag.split("}", 1)[1]
    except Exception:
        print("Invalid XML in {}".format(file_path))
        raise

    # iterparse exposes the document root once iteration has completed
    return it.root
# Parse csproj file to get version property
def parse_csproj(file_path):
    """Return the text of the first ``Version`` element in a csproj file.

    The file is read as UTF-8 and parsed incrementally; parsing stops at the
    first element whose tag, with any ``{namespace}`` prefix ignored, is
    ``Version``. Returns that element's text (``None`` for an empty element),
    or ``""`` when the file has no ``Version`` element.

    Any failure -- reading the file or parsing its XML -- prints
    ``Invalid XML in <file_path>`` and re-raises the original exception.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            xml_text = f.read()

        for _, el in ET.iterparse(StringIO(xml_text)):
            # ignore a '{namespace}' prefix so namespaced project files work too
            if el.tag.rsplit("}", 1)[-1] == "Version":
                return el.text
    except Exception:
        print("Invalid XML in {}".format(file_path))
        raise

    return ""