diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index 30be9873..d793ce74 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -24,9 +24,10 @@ on: type: choice options: - v4-8 - push: - branches: ["main"] - pull_request: # By default this runs for types assigned, opened and synchronize. + - v5litepod-8 +# push: +# branches: ["main"] +# pull_request: # By default this runs for types assigned, opened and synchronize. env: # Names must be unique in parallel running tests. @@ -35,77 +36,83 @@ env: WORKLOAD_NAME: xpktest-build-${{ github.run_attempt }} PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }} CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50" - RUN_ID: "pr-${{ github.event.number }}" PROJECT_ID: ${{secrets.PROJECT_NAME}} A3_MEGA_TEST_CLUSTER_NAME: "xpk-mega-ctk-int" A3_ULTRA_TEST_CLUSTER_NAME: "xpk-ultra-ctk-int" GKE_ML_TEST_CLUSTER_NAME: "xpk-gke-ml" - ZONE: us-central2-a - REGION: us-central2 + ZONE: europe-west4-b + REGION: europe-west4 jobs: - run-unit-tests: - runs-on: [ubuntu-22.04] - concurrency: # We support one build or nightly test to run at a time currently. - group: build-test-cluster-group - cancel-in-progress: false - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Install dependencies - run : make install-dev - - name: Run unit tests - run: make run-unittests - - run-integration-tests: - runs-on: [ubuntu-22.04] - needs: [run-unit-tests] - concurrency: # We support one build or nightly test to run at a time currently. 
- group: build-test-cluster-group - cancel-in-progress: false - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - uses: 'google-github-actions/auth@v2' - with: - credentials_json: '${{ secrets.GCP_SA_KEY }}' - - uses: google-github-actions/setup-gcloud@v2 - with: - version: '>= 363.0.0' - install_components: 'beta,gke-gcloud-auth-plugin' - - name: Verify gcp setup - run: gcloud info - - name: Install dependencies - run : make install-dev - - name: "Set auth cidr" - run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV - - name: "Set GCLOUD_CFG_PATH" - run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV - - name: "Copy credentials" - run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json - - name: "Set DEPLOYMENT_DIR" - run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV - - name: Create deployment dir - run: mkdir -p $DEPLOYMENT_DIR - - name: Run integration tests - run: make run-integrationtests +# run-unit-tests: +# runs-on: [ubuntu-22.04] +# concurrency: # We support one build or nightly test to run at a time currently. +# group: build-test-cluster-group +# cancel-in-progress: false +# steps: +# - uses: actions/checkout@v4 +# - uses: actions/setup-python@v5 +# with: +# python-version: '3.10' +# - name: Install dependencies +# run : make install-dev +# - name: Run unit tests +# run: make run-unittests +# +# run-integration-tests: +# runs-on: [ubuntu-22.04] +# needs: [run-unit-tests] +# concurrency: # We support one build or nightly test to run at a time currently. 
+# group: build-test-cluster-group +# cancel-in-progress: false +# steps: +# - uses: actions/checkout@v4 +# - uses: actions/setup-python@v5 +# with: +# python-version: '3.10' +# - uses: 'google-github-actions/auth@v2' +# with: +# credentials_json: '${{ secrets.GCP_SA_KEY }}' +# - uses: google-github-actions/setup-gcloud@v2 +# with: +# version: '>= 363.0.0' +# install_components: 'beta,gke-gcloud-auth-plugin' +# - name: Verify gcp setup +# run: gcloud info +# - name: Install dependencies +# run : make install-dev +# - name: "Set auth cidr" +# run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV +# - name: "Set GCLOUD_CFG_PATH" +# run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV +# - name: "Copy credentials" +# run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json +# - name: "Set DEPLOYMENT_DIR" +# run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV +# - name: Create deployment dir +# run: mkdir -p $DEPLOYMENT_DIR +# - name: Run integration tests +# run: make run-integrationtests cluster-create-and-delete: runs-on: [ubuntu-22.04] - needs: [run-integration-tests] +# needs: [run-integration-tests] concurrency: # We support one nightly test and one build test for each branch to run at a time currently. 
group: build-test-cluster-group-${{ github.ref }} cancel-in-progress: false steps: - - name: Change RUN_ID env var if merge to main - run: echo "RUN_ID=main" >> $GITHUB_ENV - if: ${{ github.ref == 'refs/heads/main' }} + - name: Initialize RUN_ID env var + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + RUN_ID="dispatch" + elif [ "${{ github.ref }}" == "refs/heads/main" ]; then + RUN_ID="main" + else + RUN_ID="pr-${{ github.event.number }}" + fi + echo "RUN_ID=$RUN_ID" >> $GITHUB_ENV - name: Update cluster name with TPU_TYPE and RUN_ID - run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$TPU_TYPE-$RUN_ID" >> $GITHUB_ENV + run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$RUN_ID" >> $GITHUB_ENV - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: @@ -132,40 +139,40 @@ jobs: - name: Check xpk installation run: xpk --help - name: Create a private Pathways-enabled XPK Cluster with 2x $TPU_TYPE nodepools. Larger num-nodes to avoid master resizing. - run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" - - name: Verify the created cluster is private - run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' 
&& exit 1) + run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=1 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}' + # - name: Verify the created cluster is private + # run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=europe-west4 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1) - name: Authenticate Docker run: gcloud auth configure-docker --quiet - name: Create test script to execute in workloads run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh - name: Run a base-docker-image workload - run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b + run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b - name: Run xpk inspector with the workload created above - run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME + run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --workload $WORKLOAD_NAME - name: Wait for workload completion and confirm it succeeded - run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300 + run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300 - name: Run a Pathways workload on Ubuntu base image - run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE 
--num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \"" + run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b --command "echo \"Hello world from a test script! \"" - name: Wait for Pathways workload completion and confirm it succeeded - run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300 + run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300 - name: List out the workloads on the cluster - run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b + run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b - name: Run xpk info - run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*2x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1) + run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*1x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1) - name: Delete the workload on the cluster - run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b + run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b - 
name: Delete the Pathways workload on the cluster - run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b + run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b - name: Create test script to execute in batch run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh - name: Run a batch job on the cluster - run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3 --time 60 + run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3 --time 60 - name: List out the jobs on the cluster - run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' + run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-' - name: Get created job name run: | - JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}') + JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}') echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV - name: Check created job run: | @@ -178,7 +185,7 @@ jobs: run: python3 xpk.py job info ${JOB_NAME} | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7" - name: Cancel the batch job on the cluster run: | - python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted" + python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 
"job.batch/${JOB_NAME} deleted" - name: Create shell and exit it immediately run: | cat <<'EOF' >> create-shell.exp @@ -196,7 +203,7 @@ jobs: run: python3 xpk.py shell stop - name: Delete the cluster created if: always() - run: echo 'y' | python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b + run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --force diff --git a/.github/workflows/lint_and_format.yml b/.github/workflows/lint_and_format.yml index 27e3cb74..9faa9c1e 100644 --- a/.github/workflows/lint_and_format.yml +++ b/.github/workflows/lint_and_format.yml @@ -15,10 +15,11 @@ name: Lint and Format on: - pull_request: - push: - branches: - - main + workflow_dispatch: +# pull_request: +# push: +# branches: +# - main jobs: build-and-test: diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index a5c9fb39..b4118ca7 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -545,6 +545,7 @@ def run_gke_cluster_create_command( f' --num-nodes {args.default_pool_cpu_num_nodes}' f' {args.custom_cluster_arguments}' f' {rapid_release_cmd}' + ' --verbosity=debug' ) enable_ip_alias = False diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index 9b9a6384..54c602a9 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -84,7 +84,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): children.append( # subprocess managed by list pylint: disable=consider-using-with subprocess.Popen( - command, stdout=output_logs[i], stderr=output_logs[i], shell=True + command, stdout=sys.stdout, stderr=sys.stderr, shell=True ) ) diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py index b0c8095b..5caea4c7 100644 --- a/src/xpk/core/core.py +++ b/src/xpk/core/core.py @@ -1277,7 +1277,7 @@ def run_gke_node_pool_create_command( if node_pool_name in node_pools_to_remain: continue command = ( - 'gcloud beta container node-pools create' + 'gcloud beta 
container node-pools create --verbosity=debug' f' {node_pool_name}' f' --region={zone_to_region(args.zone)}' f' --cluster={args.cluster}' @@ -1342,7 +1342,7 @@ def run_gke_node_pool_create_command( if node_pool_name in existing_node_pool_names: continue command = ( - 'gcloud beta container node-pools create' + 'gcloud beta container node-pools create --verbosity=debug' f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1' f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling' ' --min-nodes=1 --max-nodes=20' diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 8808367b..c6fb2e6a 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -1105,6 +1105,15 @@ def get_system_characteristics_by_device_type( 'v5p-17920', ), # v5litepod + 'v5litepod-8': SystemCharacteristics( + '2x4', + 2, + 'tpu-v5-lite-podslice', + 'ct5lp-hightpu-4t', + 4, + AcceleratorType['TPU'], + 'v5litepod-8', + ), 'v5litepod-16': SystemCharacteristics( '4x4', 4,