Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add recovery mode setup #543

Merged
merged 14 commits into from
Jan 13, 2025
8 changes: 8 additions & 0 deletions template/v2/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ ENV STUDIO_LOGGING_DIR="/var/log/studio/"
ENV EDITOR="nano"
ENV IMAGE_VERSION=$IMAGE_VERSION
ENV PINNED_MICROMAMBA_MINOR_VERSION="1.5.*"
ENV SAGEMAKER_RECOVERY_MODE_HOME=/tmp/sagemaker-recovery-mode-home

USER root
# Upgrade micromamba to the latest patch version in the pinned minor version range, if applicable
Expand Down Expand Up @@ -91,6 +92,13 @@ RUN if [[ -z $ARG_BASED_ENV_IN_FILENAME ]] ; \
fi && \
# Enforce dependencies are all installed from conda-forge
micromamba install -y --name base --file /tmp/$ENV_IN_FILENAME && \
mkdir -p $SAGEMAKER_RECOVERY_MODE_HOME && \
chown $MAMBA_USER:$MAMBA_USER $SAGEMAKER_RECOVERY_MODE_HOME && \
JUPYTERLAB_VERSION=$(grep "^conda-forge::jupyterlab\[" /tmp/$ENV_IN_FILENAME) && \
SAGEMAKER_JUPYTERLAB_VERSION=$(grep "^conda-forge::sagemaker-jupyterlab-extension" /tmp/$ENV_IN_FILENAME) && \
echo "Installing in sagemaker-recovery-mode micromamba environment: $JUPYTERLAB_VERSION $SAGEMAKER_JUPYTERLAB_VERSION" && \
micromamba create -n sagemaker-recovery-mode && \
micromamba install -n sagemaker-recovery-mode -y $JUPYTERLAB_VERSION $SAGEMAKER_JUPYTERLAB_VERSION && \
micromamba clean --all --yes --force-pkgs-dirs && \
rm -rf /tmp/*.in && \
sudo ln -s $(which python3) /usr/bin/python && \
Expand Down
10 changes: 8 additions & 2 deletions template/v2/dirs/usr/local/bin/entrypoint-jupyter-server
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,14 @@ set -e
# micromamba commands (e.g. using `micromamba activate` to activate environments)
eval "$(micromamba shell hook --shell=bash)"

# Activate conda environment 'base', where supervisord is installed
micromamba activate base
if [ -n "$SAGEMAKER_RECOVERY_MODE" ]; then
    # Recovery mode: point HOME at the dedicated recovery-mode home directory.
    # BUG FIX: previously this exported HOME=$SAGEMAKER_RECOVERY_MODE, which is the
    # flag value ("true"), not a path. SAGEMAKER_RECOVERY_MODE_HOME is set in the
    # Dockerfile to /tmp/sagemaker-recovery-mode-home and is the intended home dir.
    export HOME=$SAGEMAKER_RECOVERY_MODE_HOME
    # Activate conda environment `sagemaker-recovery-mode`
    micromamba activate sagemaker-recovery-mode
else
    # Activate conda environment 'base', where supervisord is installed
    micromamba activate base
fi

# Set up SAGEMAKER_APP_TYPE_LOWERCASE based on SAGEMAKER_APP_TYPE
export SAGEMAKER_APP_TYPE_LOWERCASE=$(echo $SAGEMAKER_APP_TYPE | tr '[:upper:]' '[:lower:]')
Expand Down
10 changes: 8 additions & 2 deletions template/v2/dirs/usr/local/bin/start-jupyter-server
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,14 @@ set -e

eval "$(micromamba shell hook --shell=bash)"

# Activate conda environment 'base', which is the default environment for Cosmos
micromamba activate base
if [ -n "$SAGEMAKER_RECOVERY_MODE" ]; then
    # Recovery mode: point HOME at the dedicated recovery-mode home directory.
    # BUG FIX: previously this exported HOME=$SAGEMAKER_RECOVERY_MODE, which is the
    # flag value ("true"), not a path. SAGEMAKER_RECOVERY_MODE_HOME is set in the
    # Dockerfile to /tmp/sagemaker-recovery-mode-home and is the intended home dir.
    export HOME=$SAGEMAKER_RECOVERY_MODE_HOME
    # Activate conda environment `sagemaker-recovery-mode`
    micromamba activate sagemaker-recovery-mode
else
    # Activate conda environment 'base'
    micromamba activate base
fi

# Start Jupyter server in rtc mode for shared spaces
if [ -n "$SAGEMAKER_APP_TYPE_LOWERCASE" ] && [ "$SAGEMAKER_SPACE_TYPE_LOWERCASE" == "shared" ]; then
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Test overlay image: runs the SageMaker distribution image with recovery mode enabled.
ARG SAGEMAKER_DISTRIBUTION_IMAGE
FROM $SAGEMAKER_DISTRIBUTION_IMAGE

# Keep the micromamba environment activated during subsequent Dockerfile RUN steps.
ARG MAMBA_DOCKERFILE_ACTIVATE=1

# Flag read by the entrypoint scripts; when non-empty they activate the
# 'sagemaker-recovery-mode' micromamba environment instead of 'base'.
ENV SAGEMAKER_RECOVERY_MODE=true

ENTRYPOINT ["/usr/local/bin/entrypoint-jupyter-server"]
152 changes: 127 additions & 25 deletions test/test_dockerfile_based_harness.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import subprocess
import time
from typing import List

import docker
Expand All @@ -21,7 +22,10 @@
("autogluon.test.Dockerfile", ["autogluon"]),
("matplotlib.test.Dockerfile", ["matplotlib"]),
("matplotlib.test.Dockerfile", ["matplotlib-base"]),
("sagemaker-headless-execution-driver.test.Dockerfile", ["sagemaker-headless-execution-driver"]),
(
"sagemaker-headless-execution-driver.test.Dockerfile",
["sagemaker-headless-execution-driver"],
),
("scipy.test.Dockerfile", ["scipy"]),
("numpy.test.Dockerfile", ["numpy"]),
("boto3.test.Dockerfile", ["boto3"]),
Expand All @@ -38,20 +42,36 @@
("notebook.test.Dockerfile", ["notebook"]),
("glue-sessions.test.Dockerfile", ["aws-glue-sessions"]),
("altair.test.Dockerfile", ["altair"]),
("sagemaker-studio-analytics-extension.test.Dockerfile", ["sagemaker-studio-analytics-extension"]),
("amazon-codewhisperer-jupyterlab-ext.test.Dockerfile", ["amazon-codewhisperer-jupyterlab-ext"]),
(
"sagemaker-studio-analytics-extension.test.Dockerfile",
["sagemaker-studio-analytics-extension"],
),
(
"amazon-codewhisperer-jupyterlab-ext.test.Dockerfile",
["amazon-codewhisperer-jupyterlab-ext"],
),
("jupyterlab-git.test.Dockerfile", ["jupyterlab-git"]),
("amazon-sagemaker-sql-magic.test.Dockerfile", ["amazon-sagemaker-sql-magic"]),
("amazon_sagemaker_sql_editor.test.Dockerfile", ["amazon_sagemaker_sql_editor"]),
(
"amazon_sagemaker_sql_editor.test.Dockerfile",
["amazon_sagemaker_sql_editor"],
),
("serve.test.Dockerfile", ["langchain"]),
("langchain-aws.test.Dockerfile", ["langchain-aws"]),
("mlflow.test.Dockerfile", ["mlflow"]),
("jupyter-activity-monitor-extension.test.Dockerfile", ["jupyter-activity-monitor-extension"]),
(
"jupyter-activity-monitor-extension.test.Dockerfile",
["jupyter-activity-monitor-extension"],
),
("docker-cli.test.Dockerfile", ["docker-cli"]),
("sagemaker-recovery-mode.test.Dockerfile", ["sagemaker-jupyterlab-extension"]),
],
)
def test_dockerfiles_for_cpu(
    dockerfile_path: str,
    required_packages: List[str],
    local_image_version: str,
    use_gpu: bool,
):
    """Build each parametrized test Dockerfile against the CPU image and validate it.

    Thin wrapper that delegates to _validate_docker_images with image_type "cpu";
    all assertions happen inside that helper (and inside the container entrypoints).
    """
    _validate_docker_images(dockerfile_path, required_packages, local_image_version, use_gpu, "cpu")

Expand All @@ -64,7 +84,10 @@ def test_dockerfiles_for_cpu(
("autogluon.test.Dockerfile", ["autogluon"]),
("matplotlib.test.Dockerfile", ["matplotlib"]),
("matplotlib.test.Dockerfile", ["matplotlib-base"]),
("sagemaker-headless-execution-driver.test.Dockerfile", ["sagemaker-headless-execution-driver"]),
(
"sagemaker-headless-execution-driver.test.Dockerfile",
["sagemaker-headless-execution-driver"],
),
("scipy.test.Dockerfile", ["scipy"]),
("numpy.test.Dockerfile", ["numpy"]),
("boto3.test.Dockerfile", ["boto3"]),
Expand All @@ -81,22 +104,38 @@ def test_dockerfiles_for_cpu(
("notebook.test.Dockerfile", ["notebook"]),
("glue-sessions.test.Dockerfile", ["aws-glue-sessions"]),
("altair.test.Dockerfile", ["altair"]),
("sagemaker-studio-analytics-extension.test.Dockerfile", ["sagemaker-studio-analytics-extension"]),
("amazon-codewhisperer-jupyterlab-ext.test.Dockerfile", ["amazon-codewhisperer-jupyterlab-ext"]),
(
"sagemaker-studio-analytics-extension.test.Dockerfile",
["sagemaker-studio-analytics-extension"],
),
(
"amazon-codewhisperer-jupyterlab-ext.test.Dockerfile",
["amazon-codewhisperer-jupyterlab-ext"],
),
("jupyterlab-git.test.Dockerfile", ["jupyterlab-git"]),
("amazon-sagemaker-sql-magic.test.Dockerfile", ["amazon-sagemaker-sql-magic"]),
("amazon_sagemaker_sql_editor.test.Dockerfile", ["amazon_sagemaker_sql_editor"]),
(
"amazon_sagemaker_sql_editor.test.Dockerfile",
["amazon_sagemaker_sql_editor"],
),
("serve.test.Dockerfile", ["langchain"]),
("langchain-aws.test.Dockerfile", ["langchain-aws"]),
("mlflow.test.Dockerfile", ["mlflow"]),
("sagemaker-mlflow.test.Dockerfile", ["sagemaker-mlflow"]),
("jupyter-activity-monitor-extension.test.Dockerfile", ["jupyter-activity-monitor-extension"]),
(
"jupyter-activity-monitor-extension.test.Dockerfile",
["jupyter-activity-monitor-extension"],
),
("gpu-dependencies.test.Dockerfile", ["pytorch", "tensorflow"]),
("docker-cli.test.Dockerfile", ["docker-cli"]),
("sagemaker-recovery-mode.test.Dockerfile", ["sagemaker-jupyterlab-extension"]),
],
)
def test_dockerfiles_for_gpu(
    dockerfile_path: str,
    required_packages: List[str],
    local_image_version: str,
    use_gpu: bool,
):
    """Build each parametrized test Dockerfile against the GPU image and validate it.

    Thin wrapper that delegates to _validate_docker_images with image_type "gpu";
    all assertions happen inside that helper (and inside the container entrypoints).
    """
    _validate_docker_images(dockerfile_path, required_packages, local_image_version, use_gpu, "gpu")

Expand Down Expand Up @@ -138,7 +177,11 @@ def _check_required_package_constraints(target_version: Version, required_packag


def _validate_docker_images(
dockerfile_path: str, required_packages: List[str], local_image_version: str, use_gpu: bool, image_type: str
dockerfile_path: str,
required_packages: List[str],
local_image_version: str,
use_gpu: bool,
image_type: str,
):
target_version = get_semver(local_image_version)
test_artifacts_path = f"test/test_artifacts/v{str(target_version.major)}"
Expand Down Expand Up @@ -181,15 +224,74 @@ def _validate_docker_images(
# didn't execute successfully, the Docker client below will throw an error and fail the test.
# A consequence of this design decision is that any test assertions should go inside the container's entry-point.

container = _docker_client.containers.run(image=image.id, detach=True, stderr=True, device_requests=device_requests)
# Wait till container completes execution
result = container.wait()
exit_code = result["StatusCode"]
if exit_code != 0:
# Print STD out only during test failure
print(container.logs().decode("utf-8"))
# Remove the container.
container.remove(force=True)
_docker_client.images.remove(image=image.id, force=True)
# Fail the test if docker exit code is not zero
assert exit_code == 0
# Special handling for JupyterLab entrypoint testing.
# BUG FIX: the parametrize lists register this case as
# "sagemaker-recovery-mode.test.Dockerfile", but the previous check compared
# against "recovery-mode.test.Dockerfile", so the entrypoint test never ran.
if dockerfile_path in ["sagemaker-recovery-mode.test.Dockerfile"]:
    _test_jupyterlab_entrypoint(image)
else:
    container = _docker_client.containers.run(
        image=image.id, detach=True, stderr=True, device_requests=device_requests
    )
    # Wait till container completes execution
    result = container.wait()
    exit_code = result["StatusCode"]
    if exit_code != 0:
        # Print STD out only during test failure
        print(container.logs().decode("utf-8"))
    # Remove the container.
    container.remove(force=True)
    _docker_client.images.remove(image=image.id, force=True)
    # Fail the test if docker exit code is not zero
    assert exit_code == 0


def _test_jupyterlab_entrypoint(image):
    """
    Verify the image's entrypoint brings JupyterLab up successfully.

    A successful start is detected by the supervisord RUNNING marker appearing in
    the container logs; the container is expected to stay long-running, so it is
    explicitly stopped and removed afterwards.
    """
    print("Starting test to verify JupyterLab can be started...")
    # Launch the container detached so we can poll its logs from the test process.
    container = _docker_client.containers.run(image=image.id, detach=True, stderr=True)
    try:
        # Poll the logs until supervisord reports the jupyterlab program as RUNNING.
        _wait_for_logs(container, "jupyterlabserver entered RUNNING state", timeout=5)
        print("Container logs indicate JupyterLab started successfully.")
    except Exception as err:
        # Surface the full container output before propagating the failure.
        print(f"Test failed: {err}")
        print("Container logs:")
        print(container.logs().decode("utf-8"))
        raise
    finally:
        # Always tear the container down, pass or fail.
        container.stop()
        container.remove()
        print("Stopped and removed the container.")


def _wait_for_logs(container, search_string, timeout=5, poll_interval=1):
"""
Wait for a specific string to appear in the container logs within a given timeout.

Args:
container: The container to monitor.
search_string: The string to search for in the logs.
timeout: Maximum time to wait for the string to appear (in seconds).
poll_interval: Time to wait between log checks (in seconds).

Raises:
TimeoutError: If the string does not appear in the logs within the timeout.
"""
start_time = time.time()
while time.time() - start_time < timeout:
logs = container.logs().decode("utf-8")
if search_string in logs:
return True
time.sleep(poll_interval)
raise TimeoutError(f"Container did not log '{search_string}' within {timeout} seconds.")
Loading