Skip to content

Commit

Permalink
[CI] fix gpu selector (#703)
Browse files Browse the repository at this point in the history
* fix missing "Find suitable GPU" step

* update GPU IP
  • Loading branch information
CSY-ModelCloud authored Nov 28, 2024
1 parent c72b723 commit 6a98dbc
Showing 1 changed file with 27 additions and 14 deletions.
41 changes: 27 additions & 14 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,6 @@ jobs:
run: |
echo "ip=$RUNNER" >> "$GITHUB_OUTPUT"
echo "GPU_IP=$RUNNER" >> $GITHUB_ENV
echo "ip: $ip"
if [ -n "${{ github.event.inputs.artifact_id }}" ]; then
Expand Down Expand Up @@ -300,7 +298,6 @@ jobs:
- name: Check platform
run: |
ip=${RUNNER}
echo "GPU_IP=$ip" >> $GITHUB_ENV
echo "-----------"
pip show torch
echo "-----------"
Expand All @@ -312,10 +309,10 @@ jobs:
gpu_id=-1
while [ "$gpu_id" -lt 0 ]; do
gpu_id=$(curl -s "http://${{ env.GPU_IP }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp")
gpu_id=$(curl -s "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp")
if [ "$gpu_id" -lt 0 ]; then
echo "http://${{ env.GPU_IP }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp returned $gpu_id"
echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp returned $gpu_id"
echo "No available GPU, waiting 5 seconds..."
sleep 5
else
Expand All @@ -338,7 +335,7 @@ jobs:
- name: Release GPU
if: always()
run: curl -X GET "http://${{ env.GPU_IP }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"
run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"

transformers_diff:
needs:
Expand Down Expand Up @@ -382,8 +379,6 @@ jobs:
nvcc --version
echo "== torch =="
pip show torch
ip=${{ needs.check-vm.outputs.ip }}
echo "GPU_IP=$ip" >> $GITHUB_ENV
- name: Download wheel
continue-on-error: true
Expand Down Expand Up @@ -424,10 +419,10 @@ jobs:
gpu_id=-1
while [ "$gpu_id" -lt 0 ]; do
gpu_id=$(curl -s "http://${{ env.GPU_IP }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp")
gpu_id=$(curl -s "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp")
if [ "$gpu_id" -lt 0 ]; then
echo "http://${{ env.GPU_IP }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp returned $gpu_id"
echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp returned $gpu_id"
echo "No available GPU, waiting 5 seconds..."
sleep 5
else
Expand All @@ -444,7 +439,7 @@ jobs:

- name: Release GPU
if: always()
run: curl -X GET "http://${{ env.GPU_IP }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"
run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"

torch2_5:
needs:
Expand Down Expand Up @@ -488,8 +483,6 @@ jobs:
nvcc --version
echo "== torch =="
pip show torch
ip=${{ needs.check-vm.outputs.ip }}
echo "GPU_IP=$ip" >> $GITHUB_ENV
- name: Download wheel
continue-on-error: true
Expand Down Expand Up @@ -519,10 +512,30 @@ jobs:
pip install intel_extension_for_pytorch auto_round bitblas==0.0.1.dev13 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
pip install dist/*.whl
# Poll the /gpu/get endpoint at the IP published by the check-vm job until it
# hands back a non-negative GPU ID, then export that ID (CUDA_VISIBLE_DEVICES)
# and the request timestamp (STEP_TIMESTAMP) to later steps via GITHUB_ENV.
- name: Find suitable GPU
  run: |
    timestamp=$(date +%s%3N)
    url="http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp"
    gpu_id=-1
    while [ "$gpu_id" -lt 0 ]; do
      gpu_id=$(curl -s "$url")
      # curl -s prints whatever body comes back. An empty or non-integer
      # body would make the -lt tests below error out, ending the loop with
      # a bogus ID, so treat it like "no GPU available" and retry.
      # Patterns: empty, lone "-", any non-digit/non-minus char, or a "-"
      # anywhere other than the first position.
      case "$gpu_id" in
        '' | - | *[!0-9-]* | ?*-*)
          echo "$url returned unexpected response: '$gpu_id'"
          gpu_id=-1
          ;;
      esac
      if [ "$gpu_id" -lt 0 ]; then
        echo "$url returned $gpu_id"
        echo "No available GPU, waiting 5 seconds..."
        sleep 5
      else
        echo "Allocated GPU ID: $gpu_id"
      fi
    done
    echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> "$GITHUB_ENV"
    echo "STEP_TIMESTAMP=$timestamp" >> "$GITHUB_ENV"
    echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"
- name: Run tests
if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
run: pytest --durations=0 tests/${{ matrix.test_script }}.py

- name: Release GPU
if: always()
run: curl -X GET "http://${{ env.GPU_IP }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"
run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"

0 comments on commit 6a98dbc

Please sign in to comment.