From 36cfbf22689a26fc646cbd0643eefce001dfa452 Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Tue, 13 Apr 2021 01:08:33 +0200 Subject: [PATCH] Sagemaker test docs update for framework upgrade (#11206) * increased train_runtime for model parallelism * added documentation for framework upgrade --- tests/sagemaker/README.md | 6 ++---- tests/sagemaker/test_multi_node_model_parallel.py | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/sagemaker/README.md b/tests/sagemaker/README.md index 3d8ab7c2bfe02c..e6675c190b31ac 100644 --- a/tests/sagemaker/README.md +++ b/tests/sagemaker/README.md @@ -66,8 +66,7 @@ images: ``` 2. In the PR comment describe what test, we ran and with which package versions. Here you can copy the table from [Current Tests](#current-tests). -TODO: Add a screenshot of PR + Text template to make it easy to open. - +2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1016), which information are needed. ## Test Case 2: Releasing a New AWS Framework DLC @@ -92,7 +91,6 @@ AWS_PROFILE= make test-sagemaker ``` These tests take around 10-15 minutes to finish. Preferably make a screenshot of the successfully ran tests. - ### After successful Tests: After we have successfully run tests for the new framework version we need to create a PR at the [Deep Learning Container Repository](https://github.com/aws/deep-learning-containers). @@ -136,7 +134,7 @@ images: docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] ``` -2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1016), which information are needed. +2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1025), which information are needed. ## Current Tests diff --git a/tests/sagemaker/test_multi_node_model_parallel.py b/tests/sagemaker/test_multi_node_model_parallel.py index a59c207fb0edf9..bd66e68eedd1b3 100644 --- a/tests/sagemaker/test_multi_node_model_parallel.py +++ b/tests/sagemaker/test_multi_node_model_parallel.py @@ -28,14 +28,14 @@ "script": "run_glue_model_parallelism.py", "model_name_or_path": "roberta-large", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2}, + "results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2}, }, { "framework": "pytorch", "script": "run_glue.py", "model_name_or_path": "roberta-large", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2}, + "results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2}, }, ] )