Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Support v5litepod-8. #292

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 84 additions & 77 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@ on:
type: choice
options:
- v4-8
push:
branches: ["main"]
pull_request: # By default this runs for types assigned, opened and synchronize.
- v5litepod-8
# push:
# branches: ["main"]
# pull_request: # By default this runs for types assigned, opened and synchronize.

env:
# Names must be unique in parallel running tests.
Expand All @@ -35,77 +36,83 @@ env:
WORKLOAD_NAME: xpktest-build-${{ github.run_attempt }}
PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }}
CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
RUN_ID: "pr-${{ github.event.number }}"
PROJECT_ID: ${{secrets.PROJECT_NAME}}
A3_MEGA_TEST_CLUSTER_NAME: "xpk-mega-ctk-int"
A3_ULTRA_TEST_CLUSTER_NAME: "xpk-ultra-ctk-int"
GKE_ML_TEST_CLUSTER_NAME: "xpk-gke-ml"
ZONE: us-central2-a
REGION: us-central2
ZONE: europe-west4-b
REGION: europe-west4

jobs:
run-unit-tests:
runs-on: [ubuntu-22.04]
concurrency: # We support one build or nightly test to run at a time currently.
group: build-test-cluster-group
cancel-in-progress: false
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install dependencies
run : make install-dev
- name: Run unit tests
run: make run-unittests

run-integration-tests:
runs-on: [ubuntu-22.04]
needs: [run-unit-tests]
concurrency: # We support one build or nightly test to run at a time currently.
group: build-test-cluster-group
cancel-in-progress: false
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- uses: 'google-github-actions/auth@v2'
with:
credentials_json: '${{ secrets.GCP_SA_KEY }}'
- uses: google-github-actions/setup-gcloud@v2
with:
version: '>= 363.0.0'
install_components: 'beta,gke-gcloud-auth-plugin'
- name: Verify gcp setup
run: gcloud info
- name: Install dependencies
run : make install-dev
- name: "Set auth cidr"
run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV
- name: "Set GCLOUD_CFG_PATH"
run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV
- name: "Copy credentials"
run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json
- name: "Set DEPLOYMENT_DIR"
run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV
- name: Create deployment dir
run: mkdir -p $DEPLOYMENT_DIR
- name: Run integration tests
run: make run-integrationtests
# run-unit-tests:
# runs-on: [ubuntu-22.04]
# concurrency: # We support one build or nightly test to run at a time currently.
# group: build-test-cluster-group
# cancel-in-progress: false
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - name: Install dependencies
# run : make install-dev
# - name: Run unit tests
# run: make run-unittests
#
# run-integration-tests:
# runs-on: [ubuntu-22.04]
# needs: [run-unit-tests]
# concurrency: # We support one build or nightly test to run at a time currently.
# group: build-test-cluster-group
# cancel-in-progress: false
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - uses: 'google-github-actions/auth@v2'
# with:
# credentials_json: '${{ secrets.GCP_SA_KEY }}'
# - uses: google-github-actions/setup-gcloud@v2
# with:
# version: '>= 363.0.0'
# install_components: 'beta,gke-gcloud-auth-plugin'
# - name: Verify gcp setup
# run: gcloud info
# - name: Install dependencies
# run : make install-dev
# - name: "Set auth cidr"
# run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV
# - name: "Set GCLOUD_CFG_PATH"
# run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV
# - name: "Copy credentials"
# run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json
# - name: "Set DEPLOYMENT_DIR"
# run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV
# - name: Create deployment dir
# run: mkdir -p $DEPLOYMENT_DIR
# - name: Run integration tests
# run: make run-integrationtests

cluster-create-and-delete:
runs-on: [ubuntu-22.04]
needs: [run-integration-tests]
# needs: [run-integration-tests]
concurrency: # We support one nightly test and one build test for each branch to run at a time currently.
group: build-test-cluster-group-${{ github.ref }}
cancel-in-progress: false
steps:
- name: Change RUN_ID env var if merge to main
run: echo "RUN_ID=main" >> $GITHUB_ENV
if: ${{ github.ref == 'refs/heads/main' }}
- name: Initialize RUN_ID env var
run: |
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
RUN_ID="dispatch"
elif [ "${{ github.ref }}" == "refs/heads/main" ]; then
RUN_ID="main"
else
RUN_ID="pr-${{ github.event.number }}"
fi
echo "RUN_ID=$RUN_ID" >> $GITHUB_ENV
- name: Update cluster name with TPU_TYPE and RUN_ID
run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$TPU_TYPE-$RUN_ID" >> $GITHUB_ENV
run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$RUN_ID" >> $GITHUB_ENV
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
Expand All @@ -132,40 +139,40 @@ jobs:
- name: Check xpk installation
run: xpk --help
- name: Create a private Pathways-enabled XPK Cluster with 2x $TPU_TYPE nodepools. Larger num-nodes to avoid master resizing.
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
- name: Verify the created cluster is private
run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=1 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}'
# - name: Verify the created cluster is private
# run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=europe-west4 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a base-docker-image workload
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b
- name: Run xpk inspector with the workload created above
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --workload $WORKLOAD_NAME
- name: Wait for workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b --command "echo \"Hello world from a test script! \""
- name: Wait for Pathways workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
- name: List out the workloads on the cluster
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Run xpk info
run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*2x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1)
# The flavor prefix in the expected output tracks the slice count: the previous check used 2x with --num-slices=2, and this cluster is created with --num-slices=1, so expect 1x$TPU_TYPE.
run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*1x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1)
- name: Delete the workload on the cluster
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Delete the Pathways workload on the cluster
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Create test script to execute in batch
run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
- name: Run a batch job on the cluster
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3 --time 60
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3 --time 60
- name: List out the jobs on the cluster
run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-'
- name: Get created job name
run: |
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
- name: Check created job
run: |
Expand All @@ -178,7 +185,7 @@ jobs:
run: python3 xpk.py job info ${JOB_NAME} | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
- name: Cancel the batch job on the cluster
run: |
python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"
python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep "job.batch/${JOB_NAME} deleted"
- name: Create shell and exit it immediately
run: |
cat <<'EOF' >> create-shell.exp
Expand All @@ -196,7 +203,7 @@ jobs:
run: python3 xpk.py shell stop
- name: Delete the cluster created
if: always()
run: echo 'y' | python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --force



Expand Down
9 changes: 5 additions & 4 deletions .github/workflows/lint_and_format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
name: Lint and Format

on:
pull_request:
push:
branches:
- main
workflow_dispatch:
# pull_request:
# push:
# branches:
# - main

jobs:
build-and-test:
Expand Down
1 change: 1 addition & 0 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,7 @@ def run_gke_cluster_create_command(
f' --num-nodes {args.default_pool_cpu_num_nodes}'
f' {args.custom_cluster_arguments}'
f' {rapid_release_cmd}'
' --verbosity=debug'
)

enable_ip_alias = False
Expand Down
2 changes: 1 addition & 1 deletion src/xpk/core/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
children.append(
# subprocess managed by list pylint: disable=consider-using-with
subprocess.Popen(
command, stdout=output_logs[i], stderr=output_logs[i], shell=True
command, stdout=sys.stdout, stderr=sys.stderr, shell=True
)
)

Expand Down
4 changes: 2 additions & 2 deletions src/xpk/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,7 +1277,7 @@ def run_gke_node_pool_create_command(
if node_pool_name in node_pools_to_remain:
continue
command = (
'gcloud beta container node-pools create'
'gcloud beta container node-pools create --verbosity=debug'
f' {node_pool_name}'
f' --region={zone_to_region(args.zone)}'
f' --cluster={args.cluster}'
Expand Down Expand Up @@ -1342,7 +1342,7 @@ def run_gke_node_pool_create_command(
if node_pool_name in existing_node_pool_names:
continue
command = (
'gcloud beta container node-pools create'
'gcloud beta container node-pools create --verbosity=debug'
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
' --min-nodes=1 --max-nodes=20'
Expand Down
9 changes: 9 additions & 0 deletions src/xpk/core/system_characteristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,15 @@ def get_system_characteristics_by_device_type(
'v5p-17920',
),
# v5litepod
'v5litepod-8': SystemCharacteristics(
'2x4',
2,
'tpu-v5-lite-podslice',
'ct5lp-hightpu-4t',
8,
AcceleratorType['TPU'],
'v5litepod-8',
),
'v5litepod-16': SystemCharacteristics(
'4x4',
4,
Expand Down