File tree Expand file tree Collapse file tree 1 file changed +10
-1
lines changed
.buildkite/scripts/hardware_ci Expand file tree Collapse file tree 1 file changed +10
-1
lines changed Original file line number Diff line number Diff line change
173173PARALLEL_JOB_COUNT=8
174174MYPYTHONPATH=".."
175175
176+ # Test that we're launching on the machine that has
177+ # proper access to GPUs
178+ render_gid=$(getent group render | cut -d: -f3)
179+ if [[ -z "$render_gid" ]]; then
180+ echo "Error: 'render' group not found. This is required for GPU access." >&2
181+ exit 1
182+ fi
183+
176184# check if the command contains shard flag, we will run all shards in parallel because the host has 8 GPUs.
177185if [[ $commands == *"--shard-id="* ]]; then
178186 # assign job count as the number of shards used
@@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
186194 --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
187195 --network=host \
188196 --shm-size=16gb \
197+ --group-add "$render_gid" \
189198 --rm \
190199 -e HIP_VISIBLE_DEVICES="${GPU}" \
191200 -e HF_TOKEN \
217226 --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
218227 --network=host \
219228 --shm-size=16gb \
229+ --group-add "$render_gid" \
220230 --rm \
221- -e HIP_VISIBLE_DEVICES=0 \
222231 -e HF_TOKEN \
223232 -e AWS_ACCESS_KEY_ID \
224233 -e AWS_SECRET_ACCESS_KEY \
You can’t perform that action at this time.
0 commit comments