Skip to content

Commit 9d6813d

Browse files
authored
Add python package size delta report feature for SMD images (#363)
1 parent 2daf404 commit 9d6813d

File tree

10 files changed

+471
-105
lines changed

10 files changed

+471
-105
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Runs the Python package size report for one image version and fails the job
# when the report's validation fails (see the `--validate` flag on the last step).
name: Check Image Size
on:
  # Manually call
  workflow_dispatch:
    inputs:
      image-version:
        required: true
        description: Image version=
  # Call from other workflow
  workflow_call:
    inputs:
      image-version:
        type: string
        required: true

defaults:
  run:
    # Every `run` step uses a bash login shell.
    # NOTE(review): presumably needed so the shell initialisation written by
    # setup-micromamba (`init-shell: bash`) is loaded in each step - confirm.
    shell: bash -l {0}

jobs:
  check-image-size:
    name: Run image size check
    runs-on: ubuntu-latest
    # Only run when the repository name ends with '/sagemaker-distribution'.
    if: endsWith(github.repository, '/sagemaker-distribution')
    # NOTE(review): no step visible in this workflow writes to the repository
    # or to a pull request - confirm these write scopes are actually required.
    permissions:
      pull-requests: write
      contents: write
    steps:
      - uses: actions/checkout@v4
      # Creates the `sagemaker-distribution` environment from ./environment.yml.
      - uses: mamba-org/setup-micromamba@v1
        with:
          environment-file: ./environment.yml
          environment-name: sagemaker-distribution
          init-shell: bash
      - name: Free up disk space
        run: rm -rf /opt/hostedtoolcache
      # NOTE(review): each `run` step starts its own shell, so this activation
      # alone does not carry over to the next step; presumably the login shell
      # profile activates the environment there - confirm.
      - name: Activate sagemaker-distribution
        run: micromamba activate sagemaker-distribution
      # NOTE(review): the input is interpolated directly into the shell command;
      # consider passing it through an environment variable instead.
      - name: Run size validation
        run: python ./src/main.py generate-size-report --target-patch-version ${{ inputs.image-version }} --validate

.github/workflows/monthly-minor-release.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,15 @@ jobs:
4343
with:
4444
release-type: "minor"
4545
base-version: ${{ matrix.version }}
46+
check-image-size:
  name: Check Image Size
  # The `needs` context only exposes outputs of jobs listed directly here, so
  # `generate-version-matrix` must be a direct dependency for the matrix
  # expression below to resolve. `start-monthly-minor` is kept so that the size
  # check still runs after the release job.
  needs: [generate-version-matrix, start-monthly-minor]
  permissions:
    pull-requests: write
    contents: write
  strategy:
    matrix: ${{ fromJson(needs.generate-version-matrix.outputs.matrix) }}
    fail-fast: false
  uses: aws/sagemaker-distribution/.github/workflows/check-image-size.yml@main
  with:
    # check-image-size.yml declares one required `workflow_call` input, named
    # `image-version`; passing `base-version` would fail workflow validation.
    # NOTE(review): `matrix.version` is what the release job above receives as
    # its *base* version - confirm whether the size check should instead get the
    # newly released version.
    image-version: ${{ matrix.version }}

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ repos:
1313
hooks:
1414
- id: autoflake
1515
args: ['--in-place', '--expand-star-imports', '--ignore-init-module-imports', '--remove-all-unused-imports']
16+
additional_dependencies: [setuptools]
1617
- repo: https://github.com/psf/black
1718
rev: 23.3.0
1819
hooks:

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,16 @@ VERSION=<Insert SageMaker Distribution version in semver format here. example: 0
5555
python ./src/main.py generate-staleness-report --target-patch-version $VERSION
5656
```
5757

58+
### Package Size Delta Report
59+
60+
If you want to generate/view the package size delta report for a given
61+
SageMaker distribution image version compared to a base image version, then run the following command:
62+
63+
```
64+
# The base image version is not passed on the command line: it is read from the
# source-version.txt file in the target version's directory.
65+
VERSION=<Insert SageMaker Distribution version of the target image in semver format here. example: 1.6.2>
66+
python ./src/main.py generate-size-report --target-patch-version $VERSION
67+
```
5868

5969

6070
## Example use cases

src/main.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@
2020
_PATCH,
2121
_get_dependency_upper_bound_for_runtime_upgrade,
2222
)
23-
from package_staleness import generate_package_staleness_report
23+
from package_report import (
24+
generate_package_size_report,
25+
generate_package_staleness_report,
26+
)
2427
from release_notes_generator import generate_release_notes
2528
from utils import (
2629
get_dir_for_version,
@@ -399,6 +402,21 @@ def get_arg_parser():
399402
required=True,
400403
help="Specify the base patch version for which the package staleness report needs to be " "generated.",
401404
)
405+
package_size_parser = subparsers.add_parser(
406+
"generate-size-report",
407+
help="Generates package size report for each of the packages in the given " "image version.",
408+
)
409+
package_size_parser.set_defaults(func=generate_package_size_report)
410+
package_size_parser.add_argument(
411+
"--target-patch-version",
412+
required=True,
413+
help="Specify the target patch version for which the package size report needs to be " "generated.",
414+
)
415+
package_size_parser.add_argument(
416+
"--validate",
417+
action="store_true",
418+
help="Validate package size delta and raise error if the validation failed.",
419+
)
402420
return parser
403421

404422

src/package_report.py

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
import json
2+
import os
3+
from itertools import islice
4+
5+
import conda.cli.python_api
6+
from conda.models.match_spec import MatchSpec
7+
8+
from config import _image_generator_configs
9+
from dependency_upgrader import _dependency_metadata
10+
from utils import (
11+
create_markdown_table,
12+
get_dir_for_version,
13+
get_match_specs,
14+
get_semver,
15+
pull_conda_package_metadata,
16+
sizeof_fmt,
17+
)
18+
19+
20+
def _get_package_versions_in_upstream(target_packages_match_spec_out, target_version) -> dict[str, str]:
    """Look up the latest relevant upstream conda version of each given package.

    Which upstream version counts as "relevant" depends on the image release type:

    * major release (minor == 0 and patch == 0): the last version returned by the search;
    * minor release (patch == 0 only): the last returned version that keeps the
      package's current major version;
    * patch release: the last returned version that keeps the package's current
      major and minor version.

    Args:
        target_packages_match_spec_out: Mapping of package name to its match spec.
            ``version``, ``channel`` and ``subdir`` are read from each spec.
        target_version: Version of the image being reported on. Only its ``minor``
            and ``patch`` attributes are read.

    Returns:
        Mapping of package name to the selected upstream version string.

    Raises:
        IndexError: If the search returns no version with the required prefix.
    """
    is_major_version_release = target_version.minor == 0 and target_version.patch == 0
    is_minor_version_release = target_version.patch == 0 and not is_major_version_release

    package_to_version_mapping = {}
    for package, match_spec_out in target_packages_match_spec_out.items():
        package_version = get_semver(str(match_spec_out.get("version")).removeprefix("=="))
        channel = match_spec_out.get("channel").channel_name
        # Execute the conda search in the sub directory recorded in the match spec:
        # packages such as pytorch-gpu are present only in the linux-64 sub directory.
        subdir_filter = "[subdir=" + match_spec_out.get("subdir") + "]"
        search_result = conda.cli.python_api.run_command(
            "search", channel + "::" + package + ">=" + str(package_version) + subdir_filter, "--json"
        )
        # The first element of the result holds the JSON text, structured as
        # {'package_name': [{'url': ..., 'dependencies': [...], 'version': ...}, ...]}
        package_metadata = json.loads(search_result[0])[package]

        # An empty prefix matches every version, which covers the major release case.
        if is_major_version_release:
            version_prefix = ""
        elif is_minor_version_release:
            version_prefix = f"{package_version.major}."
        else:
            version_prefix = f"{package_version.major}.{package_version.minor}."

        # Only the last matching entry is used.
        # NOTE(review): this assumes the search results are ordered oldest to newest - confirm.
        matching_versions = [x["version"] for x in package_metadata if x["version"].startswith(version_prefix)]
        package_to_version_mapping[package] = matching_versions[-1]
    return package_to_version_mapping
58+
59+
60+
def _generate_staleness_report_per_image(
    package_versions_in_upstream, target_packages_match_spec_out, image_config, version
):
    """Print a markdown staleness report for one image type.

    A package whose version in the image differs from the upstream version is
    wrapped in a red ``\\color`` markup so that it stands out in the table.

    Args:
        package_versions_in_upstream: Mapping of package name to the latest
            relevant upstream version string.
        target_packages_match_spec_out: Mapping of package name to its match
            spec; the ``version`` field is read from each spec.
        image_config: Image configuration; only ``image_type`` is read.
        version: Image version, used in the report heading.
    """
    print("\n# Staleness Report: " + str(version) + "(" + image_config["image_type"] + ")\n")
    staleness_report_rows = []
    for package, upstream_version in package_versions_in_upstream.items():
        installed_spec_version = str(target_packages_match_spec_out[package].get("version"))
        version_in_sagemaker_distribution = installed_spec_version.removeprefix("==")
        if version_in_sagemaker_distribution == upstream_version:
            package_string = package
        else:
            # Raw string: "\c" is not a valid escape sequence, so the non-raw
            # literal triggers a warning on recent Python versions.
            package_string = r"${\color{red}" + package + "}$"
        staleness_report_rows.append(
            {
                "package": package_string,
                "version_in_sagemaker_distribution": version_in_sagemaker_distribution,
                "latest_relavant_version": upstream_version,
            }
        )
    print(
        create_markdown_table(
            ["Package", "Current Version in the Distribution image", "Latest Relevant Version in " "Upstream"],
            staleness_report_rows,
        )
    )
87+
88+
89+
def _get_installed_package_versions_and_conda_versions(
    image_config, target_version_dir, target_version
) -> tuple[dict[str, MatchSpec], dict[str, str]]:
    """Collect the pinned match specs of an image and their latest upstream versions.

    Args:
        image_config: Image configuration; ``build_args["ENV_IN_FILENAME"]`` and
            ``env_out_filename`` are read from it.
        target_version_dir: Directory holding the env files of the target version.
        target_version: Version of the target image, forwarded to the upstream lookup.

    Returns:
        A ``(match_specs, upstream_versions)`` pair: the match specs from the env.out
        file restricted to the tracked packages, and the mapping of those packages to
        their latest relevant upstream version.
    """
    env_in_path = target_version_dir + "/" + image_config["build_args"]["ENV_IN_FILENAME"]
    env_out_path = target_version_dir + "/" + image_config["env_out_filename"]
    required_packages_from_target = get_match_specs(env_in_path).keys()
    match_spec_out = get_match_specs(env_out_path)
    # We only care about packages which are present in env.in.
    # Packages listed in _dependency_metadata are dropped as well; per the original
    # comment this removes Python, whose version is not tracked in the staleness report.
    target_packages_match_spec_out = {
        name: spec
        for name, spec in match_spec_out.items()
        if name in required_packages_from_target and name not in _dependency_metadata
    }
    latest_package_versions_in_upstream = _get_package_versions_in_upstream(
        target_packages_match_spec_out, target_version
    )
    return target_packages_match_spec_out, latest_package_versions_in_upstream
106+
107+
108+
def _validate_new_package_size(new_package_total_size, target_total_size, image_type, target_version):
    """Check that newly introduced packages stay within 5% of the total package size.

    A one-line summary of the new packages' share is always printed. When the
    threshold is exceeded, the percentage in that summary is wrapped in red
    ``\\color`` markup.

    Args:
        new_package_total_size: Combined size of the packages that are new in the
            target image.
        target_total_size: Combined size of all Python packages in the target image.
        image_type: Image type label used in the failure message.
        target_version: Target image version used in the failure message.

    Returns:
        A failure message when the new packages account for more than 5% of the
        total size, otherwise ``None``.
    """
    new_package_total_size_percent_threshold = 5
    if target_total_size:
        new_package_total_size_percent = round(new_package_total_size / target_total_size * 100, 2)
    else:
        # An empty target (total size 0) cannot contain new packages either; report
        # 0% instead of failing with ZeroDivisionError.
        new_package_total_size_percent = 0.0

    validate_result = None
    new_package_total_size_percent_string = str(new_package_total_size_percent)
    if new_package_total_size_percent > new_package_total_size_percent_threshold:
        validate_result = (
            "The total size of newly introduced Python packages accounts for more than "
            f"{new_package_total_size_percent_threshold}% of the total Python package size of "
            f"{image_type} image, version {target_version}! ({new_package_total_size_percent}%)"
        )
        # Raw string: "\c" is not a valid escape sequence in a regular literal.
        new_package_total_size_percent_string = r"${\color{red}" + str(new_package_total_size_percent) + "}$"

    print(
        "The total size of newly introduced Python packages is "
        + sizeof_fmt(new_package_total_size)
        + ", accounts for "
        + new_package_total_size_percent_string
        + "% of the total package size."
    )
    return validate_result
136+
137+
138+
def _generate_python_package_size_report_per_image(
    base_pkg_metadata, target_pkg_metadata, image_config, base_version, target_version
):
    """Print the Python package size report of one image type and validate it.

    Up to three markdown sections are printed: the total size summary, the 20
    largest packages of the target image and, when base metadata is available,
    the size delta of every new or changed package.

    Args:
        base_pkg_metadata: Mapping of package name to a dict with ``version`` and
            ``size`` for the base image, or a falsy value when it is unavailable.
        target_pkg_metadata: The same mapping for the target image.
        image_config: Image configuration; only ``image_type`` is read.
        base_version: Base image version, used in the heading and the warning check.
        target_version: Target image version.

    Returns:
        The failure message produced by ``_validate_new_package_size``, or ``None``
        when validation passed or no base metadata was available.
    """
    validate_result = None
    image_type = image_config["image_type"].upper()
    print("\n# Python Package Size Report " + "(" + image_type + ")\n")
    print("\n### Target Image Version: " + str(target_version) + " | Base Image Version: " + str(base_version) + "\n")
    if not base_pkg_metadata or not base_version:
        print("WARNING: No Python package metadata file found for base image, only partial results will be shown.")

    # Print out the total size change of all Python packages in the image.
    target_total_size = sum(d["size"] for d in target_pkg_metadata.values())
    base_total_size = sum(d["size"] for d in base_pkg_metadata.values()) if base_pkg_metadata else None
    # Compare against None explicitly so that a genuine zero is reported as zero
    # instead of being rendered as "-".
    total_size_delta_val = (target_total_size - base_total_size) if base_total_size is not None else None
    # A relative change is undefined against a missing or zero-sized base.
    total_size_delta_rel = (total_size_delta_val / base_total_size) if base_total_size else None
    print("\n## Python Packages Total Size Summary\n")
    print(
        create_markdown_table(
            ["Target Version Total Size", "Base Version Total Size", "Size Change (abs)", "Size Change (%)"],
            [
                {
                    "target_total_size": sizeof_fmt(target_total_size),
                    "base_total_size": sizeof_fmt(base_total_size) if base_total_size is not None else "-",
                    "size_delta_val": sizeof_fmt(total_size_delta_val) if total_size_delta_val is not None else "-",
                    "size_delta_rel": (
                        str(round(total_size_delta_rel * 100, 2)) if total_size_delta_rel is not None else "-"
                    ),
                }
            ],
        )
    )

    # Print out the largest 20 Python packages in the image, sorted descending by size.
    # Sort here rather than relying on the ordering of the incoming mapping; the sort
    # is stable, so input that is already ordered by size keeps its order.
    largest_packages = sorted(target_pkg_metadata.items(), key=lambda item: item[1]["size"], reverse=True)[:20]
    print("\n## Top-20 Largest Python Packages\n")
    print(
        create_markdown_table(
            ["Package", "Version in the Target Image", "Size"],
            [{"pkg": k, "version": v["version"], "size": sizeof_fmt(v["size"])} for k, v in largest_packages],
        )
    )

    # Print out the size delta for each changed/new package in the image, sorted descending by size.
    if base_pkg_metadata:
        print("\n## Python Package Size Delta\n")
        new_package_total_size = 0
        package_size_deltas = []
        for k, v in target_pkg_metadata.items():
            base_entry = base_pkg_metadata.get(k)
            if base_entry is not None and base_entry["version"] == v["version"]:
                # Unchanged package: not part of the delta table.
                continue
            if base_entry is None:
                new_package_total_size += v["size"]
            base_pkg_size = base_entry["size"] if base_entry is not None else 0
            size_delta_abs = v["size"] - base_pkg_size
            package_size_deltas.append(
                {
                    "package": k,
                    "target_version": v["version"],
                    "base_version": base_entry["version"] if base_entry is not None else "-",
                    "size_delta_abs": size_delta_abs,
                    "size_delta_rel": (size_delta_abs / base_pkg_size) if base_pkg_size else None,
                }
            )
        # Sort the package size delta based on absolute size diff in descending order.
        package_size_deltas.sort(key=lambda item: item["size_delta_abs"], reverse=True)
        # Format the numbers only after sorting; the key order of each row is preserved.
        package_size_delta_list = [
            {
                **row,
                "size_delta_abs": sizeof_fmt(row["size_delta_abs"]),
                "size_delta_rel": (
                    str(round(row["size_delta_rel"] * 100, 2)) if row["size_delta_rel"] is not None else "-"
                ),
            }
            for row in package_size_deltas
        ]

        # The validation prints its own summary line, which precedes the delta table.
        validate_result = _validate_new_package_size(
            new_package_total_size, target_total_size, image_type, target_version
        )
        print(
            create_markdown_table(
                [
                    "Package",
                    "Version in the Target Image",
                    "Version in the Base Image",
                    "Size Change (abs)",
                    "Size Change (%)",
                ],
                package_size_delta_list,
            )
        )
    return validate_result
222+
223+
224+
def generate_package_staleness_report(args):
    """Print a staleness report for every configured image of the target version.

    Args:
        args: Parsed command-line arguments; ``target_patch_version`` is read.
    """
    target_version = get_semver(args.target_patch_version)
    target_version_dir = get_dir_for_version(target_version)
    for image_config in _image_generator_configs:
        versions = _get_installed_package_versions_and_conda_versions(image_config, target_version_dir, target_version)
        # The helper returns (installed match specs, upstream versions); the report
        # printer takes them in the opposite order.
        installed_match_specs, upstream_versions = versions
        _generate_staleness_report_per_image(upstream_versions, installed_match_specs, image_config, target_version)
235+
236+
237+
def generate_package_size_report(args):
    """Print the Python package size report for every configured image.

    The base version to compare against is read from the first line of
    ``source-version.txt`` in the target version's directory. When that file is
    missing or its first line is blank, the report is generated without a base.

    Args:
        args: Parsed command-line arguments; ``target_patch_version`` and
            ``validate`` are read.

    Raises:
        Exception: If ``args.validate`` is set and at least one image failed the
            size validation.
    """
    target_version = get_semver(args.target_patch_version)
    target_version_dir = get_dir_for_version(target_version)

    base_version = None
    source_version_txt_file_path = f"{target_version_dir}/source-version.txt"
    if os.path.exists(source_version_txt_file_path):
        with open(source_version_txt_file_path, "r") as f:
            # Strip the line terminator so that version parsing does not depend on
            # the parser tolerating trailing whitespace.
            source_patch_version = f.readline().strip()
        if source_patch_version:
            base_version = get_semver(source_patch_version)
    base_version_dir = get_dir_for_version(base_version) if base_version else None

    validate_results = []
    for image_config in _image_generator_configs:
        base_pkg_metadata = pull_conda_package_metadata(image_config, base_version_dir) if base_version else None
        target_pkg_metadata = pull_conda_package_metadata(image_config, target_version_dir)

        validate_result = _generate_python_package_size_report_per_image(
            base_pkg_metadata, target_pkg_metadata, image_config, base_version, target_version
        )
        if validate_result:
            validate_results.append(validate_result)

    if args.validate:
        if validate_results:
            raise Exception(f"Size Validation Failed! Issues found: {validate_results}")
        print("Package Size Validation Passed!")

0 commit comments

Comments
 (0)