From 9b1bf7f7b6751d3401f3a6e2a65c45be79c453e7 Mon Sep 17 00:00:00 2001 From: Tulsi Shah Date: Fri, 24 Nov 2023 09:19:46 +0000 Subject: [PATCH 1/5] upgrading go version --- Dockerfile | 2 +- perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh | 2 +- perfmetrics/scripts/ml_tests/setup.sh | 2 +- .../scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh | 2 +- perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh | 2 +- perfmetrics/scripts/run_e2e_tests.sh | 2 +- tools/cd_scripts/e2e_test.sh | 2 +- tools/containerize_gcsfuse_docker/Dockerfile | 2 +- tools/package_gcsfuse_docker/Dockerfile | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index afc1cbc112..3d54c04048 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ # Mount the gcsfuse to /mnt/gcs: # > docker run --privileged --device /fuse -v /mnt/gcs:/gcs:rw,rshared gcsfuse -FROM golang:1.21.3-alpine as builder +FROM golang:1.21.4-alpine as builder RUN apk add git diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh b/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh index 1952c1515d..cc856e1ce8 100755 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh +++ b/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh @@ -1,7 +1,7 @@ #!/bin/bash # Install golang -wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-amd64.tar.gz -q +wget -O go_tar.tar.gz https://go.dev/dl/go1.21.4.linux-amd64.tar.gz -q rm -rf /usr/local/go && tar -C /usr/local -xzf go_tar.tar.gz export PATH=$PATH:/usr/local/go/bin diff --git a/perfmetrics/scripts/ml_tests/setup.sh b/perfmetrics/scripts/ml_tests/setup.sh index 532da67098..2635d752cd 100755 --- a/perfmetrics/scripts/ml_tests/setup.sh +++ b/perfmetrics/scripts/ml_tests/setup.sh @@ -4,7 +4,7 @@ # >> source setup.sh # Go version to be installed. 
-GO_VERSION=go1.21.3.linux-amd64.tar.gz +GO_VERSION=go1.21.4.linux-amd64.tar.gz # This function will install the given module/dependency if it's not alredy # installed. diff --git a/perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh b/perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh index e0f655500e..53e7638497 100755 --- a/perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh +++ b/perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh @@ -5,7 +5,7 @@ # and epochs functionality, and runs the model # Install go lang -wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-amd64.tar.gz -q +wget -O go_tar.tar.gz https://go.dev/dl/go1.21.4.linux-amd64.tar.gz -q sudo rm -rf /usr/local/go && tar -xzf go_tar.tar.gz && sudo mv go /usr/local export PATH=$PATH:/usr/local/go/bin diff --git a/perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh b/perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh index cbfb9b84b1..4f1299cc1e 100755 --- a/perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh +++ b/perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh @@ -34,7 +34,7 @@ set -e sudo apt-get update echo Installing git sudo apt-get install git -echo Installing go-lang 1.21.3 +echo Installing go-lang 1.21.4 wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-amd64.tar.gz -q sudo rm -rf /usr/local/go && tar -xzf go_tar.tar.gz && sudo mv go /usr/local export PATH=$PATH:/usr/local/go/bin diff --git a/perfmetrics/scripts/run_e2e_tests.sh b/perfmetrics/scripts/run_e2e_tests.sh index 61775f5538..c3b16d7c58 100755 --- a/perfmetrics/scripts/run_e2e_tests.sh +++ b/perfmetrics/scripts/run_e2e_tests.sh @@ -27,7 +27,7 @@ run_e2e_tests_on_package=$1 # e.g. architecture=arm64 or amd64 architecture=$(dpkg --print-architecture) -echo "Installing go-lang 1.21.3..." +echo "Installing go-lang 1.21.4..." 
wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-${architecture}.tar.gz -q sudo rm -rf /usr/local/go && tar -xzf go_tar.tar.gz && sudo mv go /usr/local export PATH=$PATH:/usr/local/go/bin diff --git a/tools/cd_scripts/e2e_test.sh b/tools/cd_scripts/e2e_test.sh index 1051ad2b6d..7be203ecca 100755 --- a/tools/cd_scripts/e2e_test.sh +++ b/tools/cd_scripts/e2e_test.sh @@ -111,7 +111,7 @@ else fi # install go -wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-${architecture}.tar.gz +wget -O go_tar.tar.gz https://go.dev/dl/go1.21.4.linux-${architecture}.tar.gz sudo tar -C /usr/local -xzf go_tar.tar.gz export PATH=${PATH}:/usr/local/go/bin #Write gcsfuse and go version to log file diff --git a/tools/containerize_gcsfuse_docker/Dockerfile b/tools/containerize_gcsfuse_docker/Dockerfile index dfbee7f5cc..8cb0cc0199 100644 --- a/tools/containerize_gcsfuse_docker/Dockerfile +++ b/tools/containerize_gcsfuse_docker/Dockerfile @@ -34,7 +34,7 @@ ARG OS_VERSION ARG OS_NAME # Image with gcsfuse installed and its package (.deb) -FROM golang:1.21.3 as gcsfuse-package +FROM golang:1.21.4 as gcsfuse-package RUN apt-get update -qq && apt-get install -y ruby ruby-dev rubygems build-essential rpm fuse && gem install --no-document bundler diff --git a/tools/package_gcsfuse_docker/Dockerfile b/tools/package_gcsfuse_docker/Dockerfile index 5bbe34fecf..9bf415c530 100644 --- a/tools/package_gcsfuse_docker/Dockerfile +++ b/tools/package_gcsfuse_docker/Dockerfile @@ -17,7 +17,7 @@ # Copy the gcsfuse packages to the host: # > docker run -it -v /tmp:/output gcsfuse-release cp -r /packages /output -FROM golang:1.21.3 as builder +FROM golang:1.21.4 as builder RUN apt-get update -qq && apt-get install -y ruby ruby-dev rubygems build-essential rpm && gem install --no-document bundler From 9effd77bd07d96c7e76be339a7dfbc4c5bc261d9 Mon Sep 17 00:00:00 2001 From: Tulsi Shah Date: Mon, 27 Nov 2023 04:36:38 +0000 Subject: [PATCH 2/5] upgrade go version --- 
perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh | 2 +- perfmetrics/scripts/run_e2e_tests.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh b/perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh index 4f1299cc1e..4f8f74da7a 100755 --- a/perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh +++ b/perfmetrics/scripts/presubmit_test/pr_perf_test/build.sh @@ -35,7 +35,7 @@ sudo apt-get update echo Installing git sudo apt-get install git echo Installing go-lang 1.21.4 -wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-amd64.tar.gz -q +wget -O go_tar.tar.gz https://go.dev/dl/go1.21.4.linux-amd64.tar.gz -q sudo rm -rf /usr/local/go && tar -xzf go_tar.tar.gz && sudo mv go /usr/local export PATH=$PATH:/usr/local/go/bin export CGO_ENABLED=0 diff --git a/perfmetrics/scripts/run_e2e_tests.sh b/perfmetrics/scripts/run_e2e_tests.sh index c3b16d7c58..a92b0d42c8 100755 --- a/perfmetrics/scripts/run_e2e_tests.sh +++ b/perfmetrics/scripts/run_e2e_tests.sh @@ -28,7 +28,7 @@ run_e2e_tests_on_package=$1 # e.g. architecture=arm64 or amd64 architecture=$(dpkg --print-architecture) echo "Installing go-lang 1.21.4..." -wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-${architecture}.tar.gz -q +wget -O go_tar.tar.gz https://go.dev/dl/go1.21.4.linux-${architecture}.tar.gz -q sudo rm -rf /usr/local/go && tar -xzf go_tar.tar.gz && sudo mv go /usr/local export PATH=$PATH:/usr/local/go/bin # install python3-setuptools tools. 
From 65ddfc86a4d15a4b6c01c2ad3a8af0faadf86eb4 Mon Sep 17 00:00:00 2001 From: Tulsi Shah <46474643+Tulsishah@users.noreply.github.com> Date: Mon, 27 Nov 2023 10:18:12 +0530 Subject: [PATCH 3/5] Pytorch 2 support (#1513) * adding testing statements * adding scripts to run model with pytorch2.0 * small fixes and adding licence * dino model change to support pytorch 2.0 * dino model change to support pytorch 2.0 * small fix * refactoring * refactoring * refactoring * small fix * small fix * adding comments * fixing comments * fixing comments * fixing comments * removing python version * removing python version * removing python version --- .../ml_tests/pytorch/dino/build.sh | 14 ------ .../ml_tests/pytorch/dino/continuous.cfg | 4 -- .../ml_tests/pytorch/v1_12/dino/build.sh | 27 ++++++++++ .../pytorch/v1_12/dino/continuous.cfg | 18 +++++++ .../ml_tests/pytorch/v2/dino/build.sh | 27 ++++++++++ .../ml_tests/pytorch/v2/dino/continuous.cfg | 18 +++++++ .../ml_tests/run_and_manage_test.sh | 50 ++++++++++++------- .../pytorch/{dino => }/README-usage.md | 4 +- .../pytorch/dino/setup_host_and_run_model.sh | 25 ---------- .../scripts/ml_tests/pytorch/run_container.sh | 32 ++++++++++++ .../{dino/setup_container.sh => run_model.sh} | 21 ++++++-- .../pytorch/{ => v1_12}/dino/Dockerfile | 6 ++- .../dino/setup_host_and_run_container.sh | 27 ++++++++++ .../ml_tests/pytorch/v2/dino/Dockerfile | 46 +++++++++++++++++ .../v2/dino/setup_host_and_run_container.sh | 27 ++++++++++ perfmetrics/scripts/ml_tests/setup_host.sh | 3 +- 16 files changed, 280 insertions(+), 69 deletions(-) delete mode 100755 perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh delete mode 100644 perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg create mode 100755 perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh create mode 100644 perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg create mode 100755 
perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh create mode 100644 perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg rename perfmetrics/scripts/ml_tests/pytorch/{dino => }/README-usage.md (93%) delete mode 100755 perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh create mode 100644 perfmetrics/scripts/ml_tests/pytorch/run_container.sh rename perfmetrics/scripts/ml_tests/pytorch/{dino/setup_container.sh => run_model.sh} (80%) rename perfmetrics/scripts/ml_tests/pytorch/{ => v1_12}/dino/Dockerfile (86%) create mode 100755 perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh create mode 100644 perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile create mode 100755 perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh deleted file mode 100755 index 93c106c6e6..0000000000 --- a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# This will stop execution when any command will have non-zero status. 
-set -e - -VM_NAME="pytorch-dino-7d" -ZONE_NAME="us-west1-b" -ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/dino" -TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh" - -cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/" - -source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH - diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg deleted file mode 100644 index 3dc0813d14..0000000000 --- a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg +++ /dev/null @@ -1,4 +0,0 @@ -build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh" - -# 2 hours timeout. -timeout_mins: 60 diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh new file mode 100755 index 0000000000..3f74686208 --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http:#www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This will stop execution when any command will have non-zero status. 
+set -e + +VM_NAME="pytorch-dino-7d" +ZONE_NAME="us-west1-b" +ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v1_12/dino" +TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh" +PYTORCH_VERSION="v1_12" + +cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/" + +source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg new file mode 100644 index 0000000000..184cdb9442 --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg @@ -0,0 +1,18 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh" + +# 1 hour timeout. +timeout_mins: 60 diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh new file mode 100755 index 0000000000..9f1b11a953 --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http:#www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This will stop execution when any command will have non-zero status. +set -e + +VM_NAME="pytorch2-dino-7d" +ZONE_NAME="us-west1-a" +ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino" +TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh" +PYTORCH_VERSION="v2" + +cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/" + +source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg new file mode 100644 index 0000000000..8d3d851ddb --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg @@ -0,0 +1,18 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http:#www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh" + +# 1 hour timeout. +timeout_mins: 60 diff --git a/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh b/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh index 0e7e46c906..3c0b9b7be0 100755 --- a/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh +++ b/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh @@ -27,6 +27,11 @@ ZONE_NAME=$2 ARTIFACTS_BUCKET_PATH=$3 # Path of test script relative to $HOME inside test VM. TEST_SCRIPT_PATH=$4 +# pytorch version +PYTORCH_VERSION=$5 +MACHINE_TYPE="a2-highgpu-2g" +ACCELERATOR="count=2,type=nvidia-tesla-a100" +RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus" function initialize_ssh_key () { echo "Delete existing ssh keys " @@ -55,26 +60,35 @@ function delete_existing_vm_and_create_new () { echo "Wait for 30 seconds for old VM to be deleted" sleep 30s + # NVIDIA A100 40GB GPU type machine is currently unavailable due to global shortage. + # Create NVIDIA L4 machines which are available in the us-west1-a zone.
+ if [ $PYTORCH_VERSION == "v2" ]; + then + MACHINE_TYPE="g2-standard-24" + ACCELERATOR="count=2,type=nvidia-l4" + RESERVATION="projects/$GCP_PROJECT/reservations/pytorch2-ai-ml-tests" + fi + echo "Creating VM $VM_NAME in zone $ZONE_NAME" # The below command creates VM using the reservation 'ai-ml-tests' sudo gcloud compute instances create $VM_NAME \ - --project=$GCP_PROJECT\ - --zone=$ZONE_NAME \ - --machine-type=a2-highgpu-2g \ - --network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \ - --metadata=enable-osconfig=TRUE,enable-oslogin=true \ - --maintenance-policy=TERMINATE \ - --provisioning-model=STANDARD \ - --service-account=927584127901-compute@developer.gserviceaccount.com \ - --scopes=https://www.googleapis.com/auth/cloud-platform \ - --accelerator=count=2,type=nvidia-tesla-a100 \ - --create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20230616,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \ - --no-shielded-secure-boot \ - --shielded-vtpm \ - --shielded-integrity-monitoring \ - --labels=goog-ops-agent-policy=v2-x86-template-1-0-0,goog-ec-src=vm_add-gcloud \ - --reservation-affinity=specific \ - --reservation=projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus + --project=$GCP_PROJECT\ + --zone=$ZONE_NAME \ + --machine-type=$MACHINE_TYPE \ + --network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \ + --metadata=enable-osconfig=TRUE,enable-oslogin=true \ + --maintenance-policy=TERMINATE \ + --provisioning-model=STANDARD \ + --service-account=927584127901-compute@developer.gserviceaccount.com \ + --scopes=https://www.googleapis.com/auth/cloud-platform \ + --accelerator=$ACCELERATOR \ + 
--create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20230616,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \ + --no-shielded-secure-boot \ + --shielded-vtpm \ + --shielded-integrity-monitoring \ + --labels=goog-ops-agent-policy=v2-x86-template-1-0-0,goog-ec-src=vm_add-gcloud \ + --reservation-affinity=specific \ + --reservation=$RESERVATION echo "Wait for 30 seconds for new VM to be initialised" sleep 30s @@ -132,7 +146,7 @@ exit_status=0 # Transitions: # START to START: If model run is not triggerred due to some error. # START to RUNNING: If model is successfully triggerred on GPU. This state is -# changed by setup_host.sh that runs inside docker container of test VM. +# changed by setup_host.sh that runs inside docker container of test VM. if [ $current_status == "START" ]; then echo "Update commit Id for the run" diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/README-usage.md b/perfmetrics/scripts/ml_tests/pytorch/README-usage.md similarity index 93% rename from perfmetrics/scripts/ml_tests/pytorch/dino/README-usage.md rename to perfmetrics/scripts/ml_tests/pytorch/README-usage.md index 1364c7097f..014eef20c2 100644 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/README-usage.md +++ b/perfmetrics/scripts/ml_tests/pytorch/README-usage.md @@ -16,7 +16,7 @@ curl, ca-certificates, lsb-release etc. This script contains the instruction to install gcsfuse, mount GCS-bucket using gcsfuse, and finally runs the pytorch dino model. -### File: perfmetrics/scripts/continuous_test/pytorch/dino/build.sh +### File: perfmetrics/scripts/continuous_test/pytorch/{v1_12 or v2}/dino/build.sh This is the parent script of the above two scripts. Firstly, it sets-up the host machine after that it creates the docker-image and finally it runs the container with the inststructions written in the setup_container.sh. 
@@ -40,6 +40,6 @@ log.txt - Contains the model learning parameter value after each epoch. variable - with current working directory. 3. Create a folder named "github" and clone the gcsfuse repo in that. 4. Run the below script in the current working directory: - **source github/gcsfuse/permetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh** + **source github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/{v1_12 or v2}/dino/build.sh** 5. The above command first setups the host and then start running the model inside container. diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh b/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh deleted file mode 100755 index 3c710f2e14..0000000000 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# This will stop execution when any command will have non-zero status. -set -e - -cd "$HOME/github/gcsfuse/perfmetrics/scripts" - -echo "Setting up the machine with Docker and Nvidia Driver" -source ml_tests/setup_host.sh - -cd "$HOME/github/gcsfuse" -echo "Building docker image containing all pytorch libraries..." -sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile --tag pytorch-gcsfuse - -mkdir -p container_artifacts - -echo "Running the docker image build in the previous step..." -sudo docker run --gpus all --name=pytorch_automation_container --privileged -d -v $HOME/github/gcsfuse/container_artifacts:/pytorch_dino/run_artifacts:rw,rshared \ ---shm-size=128g pytorch-gcsfuse:latest - -# Setup the log_rotation. -source perfmetrics/scripts/ml_tests/setup_log_rotation.sh $HOME/github/gcsfuse/container_artifacts/gcsfuse.log - -# Wait for the script completion as well as logs output.
-sudo docker logs -f pytorch_automation_container diff --git a/perfmetrics/scripts/ml_tests/pytorch/run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/run_container.sh new file mode 100644 index 0000000000..f0f787a9c9 --- /dev/null +++ b/perfmetrics/scripts/ml_tests/pytorch/run_container.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http:#www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +# pytorch version (e.g. v1_12, v2) +PYTORCH_VESRION=$1 +cd "$HOME/github/gcsfuse" +echo "Building docker image containing all pytorch libraries..." +sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VESRION}/dino/Dockerfile --tag pytorch-gcsfuse + +mkdir -p container_artifacts + +echo "Running the docker image build in the previous step..." +sudo docker run --gpus all --name=pytorch_automation_container --privileged -d -v $HOME/github/gcsfuse/container_artifacts:/pytorch_dino/run_artifacts:rw,rshared \ +--shm-size=128g pytorch-gcsfuse:latest + +# Setup the log_rotation. +source perfmetrics/scripts/ml_tests/setup_log_rotation.sh $HOME/github/gcsfuse/container_artifacts/gcsfuse.log + +# Wait for the script completion as well as logs output. 
+sudo docker logs -f pytorch_automation_container diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh b/perfmetrics/scripts/ml_tests/pytorch/run_model.sh similarity index 80% rename from perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh rename to perfmetrics/scripts/ml_tests/pytorch/run_model.sh index cc856e1ce8..486a08b8be 100755 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh +++ b/perfmetrics/scripts/ml_tests/pytorch/run_model.sh @@ -1,4 +1,19 @@ #!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http:#www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PYTORCH_VESRION=$1 # Install golang wget -O go_tar.tar.gz https://go.dev/dl/go1.21.4.linux-amd64.tar.gz -q @@ -39,7 +54,7 @@ def pil_loader(path: str) -> Image.Image: return rgb_img " > bypassed_code.py -folder_file="/opt/conda/lib/python3.7/site-packages/torchvision/datasets/folder.py" +folder_file="/opt/conda/lib/python3.10/site-packages/torchvision/datasets/folder.py" x=$(grep -n "def pil_loader(path: str) -> Image.Image:" $folder_file | cut -f1 -d ':') y=$(grep -n "def accimage_loader(path: str) -> Any:" $folder_file | cut -f1 -d ':') y=$((y - 2)) @@ -51,7 +66,7 @@ sed -i "$x"'r bypassed_code.py' $folder_file # nproc_per_node - by downloading the model in single thread environment. 
python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")' -ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/dino" +ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${PYTORCH_VESRION}/dino" echo "Update status file" echo "RUNNING" > status.txt gsutil cp status.txt $ARTIFACTS_BUCKET_PATH/ @@ -66,7 +81,7 @@ gsutil cp start_time.txt $ARTIFACTS_BUCKET_PATH/ # We need to run it in foreground mode to make the container running. echo "Running the pytorch dino model..." experiment=dino_experiment - python3 -m torch.distributed.launch \ + torchrun \ --nproc_per_node=2 dino/main_dino.py \ --arch vit_small \ --num_workers 20 \ diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile similarity index 86% rename from perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile rename to perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile index 8f0a34ed85..d96e780324 100644 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile +++ b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile @@ -24,9 +24,11 @@ WORKDIR "/pytorch_dino/" RUN git clone "https://github.com/facebookresearch/dino" -COPY perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh ./ +COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./ RUN mkdir -p "run_artifacts" RUN mkdir -p "gcsfuse_data" -ENTRYPOINT ["/bin/bash", "-c", "./setup_container.sh"] +ENV PYTORCH_VERSION="v1_12" + +ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"] diff --git a/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh new file mode 100755 index 0000000000..c4a4580b64 --- /dev/null +++ b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http:#www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This will stop execution when any command will have non-zero status. +set -e + +cd "$HOME/github/gcsfuse/perfmetrics/scripts" + +echo "Setting up the machine with Docker and Nvidia Driver" +# Driver version for A100 GPUs is 450.172.01 +DRIVER_VERSION="450.172.01" +source ml_tests/setup_host.sh $DRIVER_VERSION + +PYTORCH_VERSION="v1_12" +source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION diff --git a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile new file mode 100644 index 0000000000..e06a713020 --- /dev/null +++ b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile @@ -0,0 +1,46 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http:#www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Image with gcsfuse installed and its package (.deb) +FROM gcr.io/deeplearning-platform-release/pytorch-gpu.2-0.py310 + +# Allow non-root users to specify the allow_other or allow_root mount options +RUN echo "user_allow_other" > /etc/fuse.conf + +RUN pip3 install timm + +WORKDIR "/pytorch_dino/" + +RUN git clone "https://github.com/facebookresearch/dino" +# (TulsiShah) TODO: The current docker image does not support the dino model with compile mode. +# We can unblock the below code whenever the docker image supports the same to run. + +# WORKDIR "/pytorch_dino/dino" +# RUN echo '[remote "origin"]' >> .git/config +# RUN echo ' fetch = +refs/pull/262/head:refs/remotes/origin/pr/262' >> .git/config +# +# RUN git fetch origin +# RUN git diff origin/main origin/pr/262 > diff.patch +# RUN git apply diff.patch +# +# WORKDIR "/pytorch_dino/" + +COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./ + +RUN mkdir -p "run_artifacts" +RUN mkdir -p "gcsfuse_data" + +ENV PYTORCH_VERSION="v2" + +ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"] diff --git a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh new file mode 100755 index 0000000000..f2edd6f886 --- /dev/null +++ b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http:#www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# This will stop execution when any command will have non-zero status. +set -e + +cd "$HOME/github/gcsfuse/perfmetrics/scripts" + +echo "Setting up the machine with Docker and Nvidia Driver" +# Driver version for L4 GPUs is 525.60.13 +DRIVER_VERSION="525.60.13" +source ml_tests/setup_host.sh $DRIVER_VERSION + +PYTORCH_VERSION="v2" +source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION diff --git a/perfmetrics/scripts/ml_tests/setup_host.sh b/perfmetrics/scripts/ml_tests/setup_host.sh index 464da67666..e89c77e5c2 100755 --- a/perfmetrics/scripts/ml_tests/setup_host.sh +++ b/perfmetrics/scripts/ml_tests/setup_host.sh @@ -2,6 +2,8 @@ # This file installs docker engine and nvidia driver and nvidia container tool # necessary for running dlc container on the vm +DRIVER_VERSION=$1 + # Install Ops-agent to get the memory and processes' related data on VM console. curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh @@ -32,7 +34,6 @@ sudo apt-get install docker-ce docker-ce-cli containerd.io docker-compose-plugin echo "Installing driver..." sudo apt update && sudo apt install -y build-essential BASE_URL=https://us.download.nvidia.com/tesla -DRIVER_VERSION=450.172.01 sudo curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run sudo sh NVIDIA-Linux-x86_64-$DRIVER_VERSION.run -s From b0970f408f47fe5e4b34f1b36cc4af59d560f297 Mon Sep 17 00:00:00 2001 From: Tulsi Shah Date: Mon, 27 Nov 2023 10:11:04 +0000 Subject: [PATCH 4/5] empty commit From b24102bd747c287dfa5afd92580906a0bcc4edf2 Mon Sep 17 00:00:00 2001 From: Tulsi Shah Date: Mon, 27 Nov 2023 16:41:14 +0000 Subject: [PATCH 5/5] empty commit