Skip to content

Commit

Permalink
Test
Browse files (browse the repository at this point in the history)
  • Loading branch information
IrvingMg committed Jan 8, 2025
1 parent 517b141 commit 38a9338
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 29 deletions.
38 changes: 19 additions & 19 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ env:
A3_MEGA_TEST_CLUSTER_NAME: "xpk-mega-ctk-int"
A3_ULTRA_TEST_CLUSTER_NAME: "xpk-ultra-ctk-int"
GKE_ML_TEST_CLUSTER_NAME: "xpk-gke-ml"
ZONE: us-central2-a
REGION: us-central2
ZONE: europe-west4-b
REGION: europe-west4

jobs:
# run-unit-tests:
Expand Down Expand Up @@ -140,40 +140,40 @@ jobs:
- name: Check xpk installation
run: xpk --help
- name: Create a private Pathways-enabled XPK Cluster with 2x $TPU_TYPE nodepools. Larger num-nodes to avoid master resizing.
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=$TPU_TYPE --num-slices=1 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
- name: Verify the created cluster is private
run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --tpu-type=$TPU_TYPE --num-slices=2 --zone=europe-west4-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=1 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}'
# - name: Verify the created cluster is private
# run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=europe-west4 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a base-docker-image workload
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=europe-west4-b
- name: Run xpk inspector with the workload created above
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --workload $WORKLOAD_NAME
- name: Wait for workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=2 --zone=europe-west4-b --command "echo \"Hello world from a test script! \""
- name: Wait for Pathways workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
- name: List out the workloads on the cluster
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Run xpk info
run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*2x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1)
run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*2x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1)
- name: Delete the workload on the cluster
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Delete the Pathways workload on the cluster
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Create test script to execute in batch
run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
- name: Run a batch job on the cluster
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3 --time 60
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3 --time 60
- name: List out the jobs on the cluster
run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-'
- name: Get created job name
run: |
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
- name: Check created job
run: |
Expand All @@ -186,7 +186,7 @@ jobs:
run: python3 xpk.py job info ${JOB_NAME} | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
- name: Cancel the batch job on the cluster
run: |
python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"
python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep "job.batch/${JOB_NAME} deleted"
- name: Create shell and exit it immediately
run: |
cat <<'EOF' >> create-shell.exp
Expand All @@ -204,7 +204,7 @@ jobs:
run: python3 xpk.py shell stop
- name: Delete the cluster created
if: always()
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --force



Expand Down
1 change: 1 addition & 0 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,7 @@ def run_gke_cluster_create_command(
f' --num-nodes {args.default_pool_cpu_num_nodes}'
f' {args.custom_cluster_arguments}'
f' {rapid_release_cmd}'
f' --verbosity=debug'
)

enable_ip_alias = False
Expand Down
4 changes: 1 addition & 3 deletions src/xpk/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1295,7 +1295,7 @@ def run_gke_node_pool_create_command(
command += (
f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
)
# command += f' --tpu-topology={system.topology}'
command += f' --tpu-topology={system.topology}'
command += f' {args.custom_tpu_nodepool_arguments}'
elif system.accelerator_type == AcceleratorType['GPU']:
subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
Expand Down Expand Up @@ -1351,8 +1351,6 @@ def run_gke_node_pool_create_command(
create_commands.append(command)
create_task_names.append(task)

xpk_print(f"\n{create_commands}\n")

for i, command in enumerate(create_commands):
xpk_print(f'To complete {create_task_names[i]} we are executing {command}')
max_return_code = run_commands(
Expand Down
14 changes: 7 additions & 7 deletions src/xpk/core/system_characteristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,13 +1106,13 @@ def get_system_characteristics_by_device_type(
),
# v5litepod
'v5litepod-8': SystemCharacteristics(
'2x4',
1,
'tpu-v5-lite-podslice',
'ct5lp-hightpu-8t',
8,
AcceleratorType['TPU'],
'v5litepod-8',
'2x4', # topology
2, # vms_per_slice
'tpu-v5-lite-podslice', # gke_accelerator
'ct5lp-hightpu-4t', # gce_machine_type
8, # chips_per_vm
AcceleratorType['TPU'], # accelerator_type
'v5litepod-8', # device_type
),
'v5litepod-16': SystemCharacteristics(
'4x4',
Expand Down

0 comments on commit 38a9338

Please sign in to comment.