Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CI] fix gpu selector #703

Merged
merged 2 commits into from
Nov 28, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 27 additions & 14 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,6 @@ jobs:
run: |
echo "ip=$RUNNER" >> "$GITHUB_OUTPUT"

echo "GPU_IP=$RUNNER" >> $GITHUB_ENV

echo "ip: $ip"

if [ -n "${{ github.event.inputs.artifact_id }}" ]; then
Expand Down Expand Up @@ -300,7 +298,6 @@ jobs:
- name: Check platform
run: |
ip=${RUNNER}
echo "GPU_IP=$ip" >> $GITHUB_ENV
echo "-----------"
pip show torch
echo "-----------"
Expand All @@ -312,10 +309,10 @@ jobs:
gpu_id=-1

while [ "$gpu_id" -lt 0 ]; do
gpu_id=$(curl -s "http://${{ env.GPU_IP }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp")
gpu_id=$(curl -s "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp")

if [ "$gpu_id" -lt 0 ]; then
echo "http://${{ env.GPU_IP }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp returned $gpu_id"
echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp returned $gpu_id"
echo "No available GPU, waiting 5 seconds..."
sleep 5
else
Expand All @@ -338,7 +335,7 @@ jobs:

- name: Release GPU
if: always()
run: curl -X GET "http://${{ env.GPU_IP }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"
run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"

transformers_diff:
needs:
Expand Down Expand Up @@ -382,8 +379,6 @@ jobs:
nvcc --version
echo "== torch =="
pip show torch
ip=${{ needs.check-vm.outputs.ip }}
echo "GPU_IP=$ip" >> $GITHUB_ENV

- name: Download wheel
continue-on-error: true
Expand Down Expand Up @@ -424,10 +419,10 @@ jobs:
gpu_id=-1

while [ "$gpu_id" -lt 0 ]; do
gpu_id=$(curl -s "http://${{ env.GPU_IP }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp")
gpu_id=$(curl -s "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp")

if [ "$gpu_id" -lt 0 ]; then
echo "http://${{ env.GPU_IP }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp returned $gpu_id"
echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp returned $gpu_id"
echo "No available GPU, waiting 5 seconds..."
sleep 5
else
Expand All @@ -444,7 +439,7 @@ jobs:

- name: Release GPU
if: always()
run: curl -X GET "http://${{ env.GPU_IP }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"
run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"

torch2_5:
needs:
Expand Down Expand Up @@ -488,8 +483,6 @@ jobs:
nvcc --version
echo "== torch =="
pip show torch
ip=${{ needs.check-vm.outputs.ip }}
echo "GPU_IP=$ip" >> $GITHUB_ENV

- name: Download wheel
continue-on-error: true
Expand Down Expand Up @@ -519,10 +512,30 @@ jobs:
pip install intel_extension_for_pytorch auto_round bitblas==0.0.1.dev13 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
pip install dist/*.whl

# Reserve a GPU from the allocator service on the check-vm host and export it
# to later steps. The service is polled every 5 seconds until it answers with
# a non-negative GPU index; a negative answer is reported as "No available GPU".
- name: Find suitable GPU
  run: |
    # Millisecond timestamp for this reservation; exported below as
    # STEP_TIMESTAMP alongside the allocated GPU index.
    timestamp=$(date +%s%3N)
    url="http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp"
    gpu_id=-1

    while [ "$gpu_id" -lt 0 ]; do
      gpu_id=$(curl -s "$url")

      # An empty or non-numeric reply (e.g. an HTML error page) would make
      # the integer tests below error out, silently end the loop, and export
      # a bogus CUDA_VISIBLE_DEVICES. Fail loudly instead.
      if ! [[ "$gpu_id" =~ ^-?[0-9]+$ ]]; then
        echo "$url returned unexpected response: '$gpu_id'"
        exit 1
      fi

      if [ "$gpu_id" -lt 0 ]; then
        echo "$url returned $gpu_id"
        echo "No available GPU, waiting 5 seconds..."
        sleep 5
      else
        echo "Allocated GPU ID: $gpu_id"
      fi
    done

    # Make the reservation visible to the following steps of this job.
    echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> "$GITHUB_ENV"
    echo "STEP_TIMESTAMP=$timestamp" >> "$GITHUB_ENV"
    echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"

# Run the pytest file named by the current matrix entry.
- name: Run tests
# Runs when no `test_names` input was supplied, or when that input contains
# this matrix entry's test_script; never runs once the workflow is cancelled.
if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
# --durations=0 asks pytest to report the duration of every test.
run: pytest --durations=0 tests/${{ matrix.test_script }}.py

- name: Release GPU
if: always()
run: curl -X GET "http://${{ env.GPU_IP }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"
run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}"