sagemaker_image_builder/package_report.py (237 lines of code) (raw):
import json
import os
from itertools import islice
import conda.cli.python_api
from conda.models.match_spec import MatchSpec
from sagemaker_image_builder.dependency_upgrader import _dependency_metadata
from sagemaker_image_builder.utils import (
create_markdown_table,
get_dir_for_version,
get_match_specs,
get_semver,
pull_conda_package_metadata,
sizeof_fmt,
)
def _latest_version_with_prefix(package_metadata, version_prefix):
    """Return the last listed version that belongs to the ``version_prefix`` release line.

    A version belongs to the line when it equals ``version_prefix`` or extends it with a
    further dotted component. A bare ``startswith("1.2")`` check would also accept
    ``"1.20.0"``, which belongs to a different minor line.

    Args:
        package_metadata: List of search-result records, each carrying a ``"version"`` key.
        version_prefix: Dotted prefix without a trailing dot, e.g. ``"1"`` or ``"1.2"``.

    Returns:
        The ``"version"`` of the last matching record.

    Raises:
        IndexError: If no record matches the prefix.
    """
    dotted_prefix = version_prefix + "."
    candidates = [
        record["version"]
        for record in package_metadata
        if record["version"] == version_prefix or record["version"].startswith(dotted_prefix)
    ]
    if not candidates:
        raise IndexError(f"No upstream version found for release line '{version_prefix}'")
    # The last entry is taken as the latest one (assumes the search output is listed
    # oldest-to-newest -- TODO confirm against conda's ordering guarantees).
    return candidates[-1]


def _get_package_versions_in_upstream(target_packages_match_spec_out, target_version) -> dict[str, str]:
    """Look up the latest relevant upstream version of every tracked package.

    "Relevant" depends on the kind of image release described by ``target_version``:

    * major release (minor == 0 and patch == 0): the last version returned by the search;
    * minor release (patch == 0): the last version sharing the package's major version;
    * patch release: the last version sharing the package's major and minor version.

    Args:
        target_packages_match_spec_out: Mapping of package name to its match spec; each
            spec must provide ``version``, ``channel`` and ``subdir`` via ``.get``.
        target_version: Image version object exposing ``minor`` and ``patch``.

    Returns:
        Mapping of package name to the selected upstream version string.

    Raises:
        IndexError: If no upstream version exists in the required release line.
    """
    package_to_version_mapping = {}
    is_major_version_release = target_version.minor == 0 and target_version.patch == 0
    is_minor_version_release = target_version.patch == 0 and not is_major_version_release
    for package, match_spec_out in target_packages_match_spec_out.items():
        package_version = get_semver(str(match_spec_out.get("version")).removeprefix("=="))
        channel = match_spec_out.get("channel").channel_name
        # Restrict the search to the subdir recorded in the match spec; some packages
        # (e.g. pytorch-gpu) are only published in a platform-specific subdir.
        subdir_filter = "[subdir=" + match_spec_out.get("subdir") + "]"
        search_result = conda.cli.python_api.run_command(
            "search", channel + "::" + package + ">=" + str(package_version) + subdir_filter, "--json"
        )
        # The first element of the result holds the JSON payload, shaped as
        # {'package_name': [{'url': ..., 'dependencies': [...], 'version': ...}, ...]}.
        package_metadata = json.loads(search_result[0])[package]
        if is_major_version_release:
            latest_package_version_in_conda = package_metadata[-1]["version"]
        elif is_minor_version_release:
            latest_package_version_in_conda = _latest_version_with_prefix(
                package_metadata, str(package_version.major)
            )
        else:
            latest_package_version_in_conda = _latest_version_with_prefix(
                package_metadata, f"{package_version.major}.{package_version.minor}"
            )
        package_to_version_mapping[package] = latest_package_version_in_conda
    return package_to_version_mapping
def _generate_staleness_report_per_image(
    package_versions_in_upstream, target_packages_match_spec_out, image_config, version
):
    """Print a markdown staleness table for one image to stdout.

    One row is printed per package in ``package_versions_in_upstream``. Packages whose
    version in the image differs from the upstream version get their name wrapped in a
    colour directive so they stand out in the rendered report.

    Args:
        package_versions_in_upstream: Mapping of package name to upstream version string.
        target_packages_match_spec_out: Mapping of package name to its match spec; the
            ``version`` entry is read and a leading ``==`` is stripped.
        image_config: Image configuration; only ``image_type`` is read.
        version: Image version shown in the report heading.
    """
    print("\n# Staleness Report: " + str(version) + "(" + image_config["image_type"] + ")\n")
    print("Package | Current Version in the image | Latest Relevant Version in Upstream")
    print("---|---|---")
    for package, upstream_version in package_versions_in_upstream.items():
        version_in_image = str(target_packages_match_spec_out[package].get("version")).removeprefix("==")
        if version_in_image == upstream_version:
            package_cell = package
        else:
            # The backslash is escaped explicitly: "\c" is not a valid escape sequence and
            # triggers a SyntaxWarning on recent Python versions. The emitted text is unchanged.
            package_cell = "${\\color{red}" + package + "}$"
        print(package_cell + "|" + version_in_image + "|" + upstream_version)
def _get_installed_package_versions_and_conda_versions(
    image_config, target_version_dir, target_version
) -> tuple[dict[str, MatchSpec], dict[str, str]]:
    """Collect the tracked packages of an image and their latest upstream versions.

    Args:
        image_config: Image configuration; ``build_args.ENV_IN_FILENAME`` and
            ``env_out_filename`` name the env files inside ``target_version_dir``.
        target_version_dir: Directory holding the env files of the target version.
        target_version: Image version, forwarded to the upstream version lookup.

    Returns:
        A pair ``(target_packages_match_spec_out, latest_package_versions_in_upstream)``:
        the match specs from the env.out file restricted to tracked packages, and the
        mapping of those packages to their latest relevant upstream version.
    """
    env_in_file_name = image_config["build_args"]["ENV_IN_FILENAME"]
    env_out_file_name = image_config["env_out_filename"]
    required_packages_from_target = get_match_specs(target_version_dir + "/" + env_in_file_name)
    match_spec_out = get_match_specs(target_version_dir + "/" + env_out_file_name)
    # Only packages explicitly requested in env.in are tracked. Anything listed in
    # _dependency_metadata is excluded as well (presumably python and similar runtime
    # pins that should not appear in the staleness report -- verify against that mapping).
    target_packages_match_spec_out = {
        k: v for k, v in match_spec_out.items() if k in required_packages_from_target and k not in _dependency_metadata
    }
    latest_package_versions_in_upstream = _get_package_versions_in_upstream(
        target_packages_match_spec_out, target_version
    )
    return target_packages_match_spec_out, latest_package_versions_in_upstream
def _validate_new_package_size(new_package_total_size, target_total_size, image_type, target_version):
    """Check that newly introduced packages stay within 5% of the image's total package size.

    A one-line summary is always printed to stdout. When the threshold is exceeded the
    percentage in that summary is wrapped in a red colour directive.

    Args:
        new_package_total_size: Combined size of the packages that are new in the target image.
        target_total_size: Combined size of all Python packages in the target image.
        image_type: Image type label used in the failure message.
        target_version: Image version used in the failure message.

    Returns:
        ``None`` when the new packages account for at most 5% of the total size, otherwise
        a human-readable description of the violation.
    """
    new_package_total_size_percent_threshold = 5
    validate_result = None
    # An image without any package data has nothing to compare against; report 0% instead
    # of failing with a ZeroDivisionError.
    if target_total_size:
        new_package_total_size_percent = round(new_package_total_size / target_total_size * 100, 2)
    else:
        new_package_total_size_percent = 0.0
    new_package_total_size_percent_string = str(new_package_total_size_percent)
    if new_package_total_size_percent > new_package_total_size_percent_threshold:
        validate_result = (
            "The total size of newly introduced Python packages accounts for more than "
            f"{new_package_total_size_percent_threshold}% of the total Python package size of "
            f"{image_type} image, version {target_version!s}! ({new_package_total_size_percent}%)"
        )
        # The backslash is escaped explicitly: "\c" is not a valid escape sequence. The
        # emitted text is unchanged.
        new_package_total_size_percent_string = "${\\color{red}" + str(new_package_total_size_percent) + "}$"
    print(
        "The total size of newly introduced Python packages is "
        + sizeof_fmt(new_package_total_size)
        + ", accounts for "
        + new_package_total_size_percent_string
        + "% of the total package size."
    )
    return validate_result
def _generate_python_package_size_report_per_image(
    base_pkg_metadata, target_pkg_metadata, image_config, base_version, target_version
):
    """Print the Python package size report for one image and validate new-package size.

    The report is written to stdout as markdown and has up to three sections: a total size
    summary, the 20 largest packages of the target image, and (only when base metadata is
    available) the per-package size delta for new or changed packages.

    Args:
        base_pkg_metadata: Mapping of package name to a dict with ``version`` and ``size``
            for the base image, or a falsy value when unavailable.
        target_pkg_metadata: Same mapping for the target image.
        image_config: Image configuration; only ``image_type`` is read.
        base_version: Version of the base image, or ``None``.
        target_version: Version of the target image.

    Returns:
        The result of ``_validate_new_package_size`` when base metadata is available,
        otherwise ``None``.
    """
    validate_result = None
    image_type = image_config["image_type"].upper()
    print("\n# Python Package Size Report " + "(" + image_type + ")\n")
    print("\n### Target Image Version: " + str(target_version) + " | Base Image Version: " + str(base_version) + "\n")
    if not base_pkg_metadata or not base_version:
        print("WARNING: No Python package metadata file found for base image, only partial results will be shown.")
    base_total_size = sum(d["size"] for d in base_pkg_metadata.values()) if base_pkg_metadata else None

    # Total size change of all Python packages in the image.
    target_total_size = sum(d["size"] for d in target_pkg_metadata.values())
    # A missing or zero base total means no meaningful delta can be computed.
    total_size_delta_val = (target_total_size - base_total_size) if base_total_size else None
    total_size_delta_rel = (total_size_delta_val / base_total_size) if base_total_size else None
    print("\n## Python Packages Total Size Summary\n")
    print(
        create_markdown_table(
            ["Target Version Total Size", "Base Version Total Size", "Size Change (abs)", "Size Change (%)"],
            [
                {
                    "target_total_size": sizeof_fmt(target_total_size),
                    "base_total_size": sizeof_fmt(base_total_size) if base_total_size else "-",
                    # "-" means "not available"; a real delta of zero is still reported.
                    "size_delta_val": sizeof_fmt(total_size_delta_val) if total_size_delta_val is not None else "-",
                    "size_delta_rel": (
                        str(round(total_size_delta_rel * 100, 2)) if total_size_delta_rel is not None else "-"
                    ),
                }
            ],
        )
    )

    # The 20 largest Python packages in the image, sorted descending by size. Sorting here
    # keeps the table correct regardless of the order in which the metadata was supplied.
    largest_packages = sorted(target_pkg_metadata.items(), key=lambda item: item[1]["size"], reverse=True)[:20]
    print("\n## Top-20 Largest Python Packages\n")
    print(
        create_markdown_table(
            ["Package", "Version in the Target Image", "Size"],
            [{"pkg": k, "version": v["version"], "size": sizeof_fmt(v["size"])} for k, v in largest_packages],
        )
    )

    # Size delta for each changed/new package in the image, sorted descending by size.
    if base_pkg_metadata:
        print("\n## Python Package Size Delta\n")
        new_package_total_size = 0
        package_size_delta_list = []
        for k, v in target_pkg_metadata.items():
            is_new_package = k not in base_pkg_metadata
            if not is_new_package and base_pkg_metadata[k]["version"] == v["version"]:
                # Unchanged packages are left out of the delta table.
                continue
            base_pkg_size = 0 if is_new_package else base_pkg_metadata[k]["size"]
            size_delta_abs = v["size"] - base_pkg_size
            package_size_delta_list.append(
                {
                    "package": k,
                    "target_version": v["version"],
                    "base_version": "-" if is_new_package else base_pkg_metadata[k]["version"],
                    "size_delta_abs": size_delta_abs,
                    # No relative change exists without a non-zero base size.
                    "size_delta_rel": (size_delta_abs / base_pkg_size) if base_pkg_size else None,
                }
            )
            if is_new_package:
                new_package_total_size += v["size"]
        # Sort on the numeric absolute delta before the values are turned into display strings.
        package_size_delta_list = sorted(package_size_delta_list, key=lambda item: item["size_delta_abs"], reverse=True)
        for row in package_size_delta_list:
            row["size_delta_rel"] = (
                str(round(row["size_delta_rel"] * 100, 2)) if row["size_delta_rel"] is not None else "-"
            )
            row["size_delta_abs"] = sizeof_fmt(row["size_delta_abs"])
        validate_result = _validate_new_package_size(
            new_package_total_size, target_total_size, image_type, target_version
        )
        print(
            create_markdown_table(
                [
                    "Package",
                    "Version in the Target Image",
                    "Version in the Base Image",
                    "Size Change (abs)",
                    "Size Change (%)",
                ],
                package_size_delta_list,
            )
        )
    return validate_result
def generate_package_staleness_report(args):
    """Print a staleness report for every image described in the image config file.

    Args:
        args: Parsed arguments providing ``image_config_file`` (path to a JSON list of
            image configurations) and ``target_patch_version`` (version string of the
            image release to inspect).
    """
    with open(args.image_config_file) as config_stream:
        configs = json.load(config_stream)

    release_version = get_semver(args.target_patch_version)
    release_dir = get_dir_for_version(release_version)

    for config in configs:
        # The helper yields the installed match specs first and the upstream versions second.
        installed_specs, upstream_versions = _get_installed_package_versions_and_conda_versions(
            config, release_dir, release_version
        )
        _generate_staleness_report_per_image(upstream_versions, installed_specs, config, release_version)
def generate_package_size_report(args):
    """Print the Python package size report for every image and optionally validate it.

    The base version is read from the first line of ``source-version.txt`` inside the
    target version directory. When that file is missing or its first line is blank, the
    report is generated without a base image to compare against.

    Args:
        args: Parsed arguments providing ``image_config_file`` (path to a JSON list of
            image configurations), ``target_patch_version`` (version string of the image
            release to inspect) and ``validate`` (whether size validation is enforced).

    Raises:
        Exception: If ``args.validate`` is set and at least one image fails size validation.
    """
    with open(args.image_config_file) as jsonfile:
        _image_generator_configs = json.load(jsonfile)
    target_version = get_semver(args.target_patch_version)
    target_version_dir = get_dir_for_version(target_version)

    base_version = None
    source_version_txt_file_path = f"{target_version_dir}/source-version.txt"
    if os.path.exists(source_version_txt_file_path):
        with open(source_version_txt_file_path, "r") as f:
            # readline() keeps the trailing newline; strip it before parsing the version.
            source_patch_version = f.readline().strip()
        if source_patch_version:
            base_version = get_semver(source_patch_version)
    base_version_dir = get_dir_for_version(base_version) if base_version else None

    validate_results = []
    for image_config in _image_generator_configs:
        base_pkg_metadata = pull_conda_package_metadata(image_config, base_version_dir) if base_version else None
        target_pkg_metadata = pull_conda_package_metadata(image_config, target_version_dir)
        validate_result = _generate_python_package_size_report_per_image(
            base_pkg_metadata, target_pkg_metadata, image_config, base_version, target_version
        )
        if validate_result:
            validate_results.append(validate_result)

    if args.validate:
        if validate_results:
            raise Exception(f"Size Validation Failed! Issues found: {validate_results}")
        print("Package Size Validation Passed!")