Skip to content

Commit 80c9275

Browse files
Enabling cooperative multi-gpu tests on multi-gpu nodes (#27986)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
1 parent e50c454 commit 80c9275

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,14 @@ fi
173173
PARALLEL_JOB_COUNT=8
174174
MYPYTHONPATH=".."
175175

176+
# Verify that we are launching on a machine that has
177+
# the 'render' group, which is required for GPU access
178+
render_gid=$(getent group render | cut -d: -f3)  # third colon-separated field of the 'render' group entry; later passed as the --group-add value
179+
if [[ -z "$render_gid" ]]; then  # empty string: the lookup produced no value for 'render'
180+
echo "Error: 'render' group not found. This is required for GPU access." >&2
181+
exit 1  # abort before any test command is launched
182+
fi
183+
176184
# Check if the command contains a shard flag; if so, we run all shards in parallel because the host has 8 GPUs.
177185
if [[ $commands == *"--shard-id="* ]]; then
178186
# assign job count as the number of shards used
@@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
186194
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
187195
--network=host \
188196
--shm-size=16gb \
197+
--group-add "$render_gid" \
189198
--rm \
190199
-e HIP_VISIBLE_DEVICES="${GPU}" \
191200
-e HF_TOKEN \
@@ -217,8 +226,8 @@ else
217226
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
218227
--network=host \
219228
--shm-size=16gb \
229+
--group-add "$render_gid" \
220230
--rm \
221-
-e HIP_VISIBLE_DEVICES=0 \
222231
-e HF_TOKEN \
223232
-e AWS_ACCESS_KEY_ID \
224233
-e AWS_SECRET_ACCESS_KEY \

0 commit comments

Comments
 (0)