@rfm.simple_test
class MultiLaunchGPUTest(rfm.RunOnlyRegressionTest):
    """Check CPU/GPU binding of concurrently launched single-task job steps.

    The job script starts several ``gpu_check -l`` job steps in the
    background, each through the generated ``select_step_gpu`` wrapper.
    The wrapper exports ``ROCR_VISIBLE_DEVICES`` as the Slurm step id and
    pins the process with ``numactl`` to the CPU range stored at that index
    of ``CPU_MAP``. The job's own executable is the shell builtin ``wait``,
    run through the local launcher, so the script blocks until all the
    background steps have finished.

    The sanity check compares the CCD number reported for the CPU with the
    CCD number reported next to the GCD, for every line of output.
    """

    valid_systems = ['lumi:gpu']
    valid_prog_environs = ['builtin']
    executable = 'wait'
    num_tasks_per_node = 1
    num_gpus_per_node = 8
    num_nodes = 1
    num_tasks = 8
    exclusive_access = True
    modules = ['lumi-CPEtools']

    tags = {'production', 'lumi'}

    @run_after('init')
    def add_select_gpu_wrapper(self):
        """Emit the ``select_step_gpu`` wrapper script before the run.

        The lines are raw strings so that ``\\$`` reaches the job script
        unchanged; inside the unquoted here-document the backslash keeps
        the shell from expanding the variables while the wrapper is being
        written.
        """
        self.prerun_cmds += [
            'cat << EOF > select_step_gpu',
            '#!/bin/bash',
            'CPU_MAP="49-55 57-63 17-23 25-31 1-7 9-15 33-39 41-47"',
            r'CPUS=(\${CPU_MAP})',
            r'export ROCR_VISIBLE_DEVICES=\$SLURM_STEPID',
            r'exec numactl --physcpubind=\${CPUS[\$SLURM_STEPID]} \$*',
            'EOF',
            'chmod +x ./select_step_gpu',
        ]

    @run_before('run')
    def pre_launch(self):
        """Queue the background job steps using the job's launcher command.

        NOTE(review): this reads ``self.job.launcher`` and therefore has to
        run before ``set_launcher`` swaps in the local launcher -- confirm
        that hooks of the same stage execute in definition order.
        """
        self.job.options += ['--cpus-per-task=56']
        cmd = self.job.launcher.run_command(self.job)
        background_cmd = 'gpu_check -l'
        step_cmd = (
            f'{cmd} -n 1 --overlap --exact ./select_step_gpu '
            f'{background_cmd} &'
        )
        # NOTE(review): only ``num_tasks - 1`` steps are started although
        # CPU_MAP holds 8 entries and 8 GPUs are requested -- confirm
        # whether the last GPU is meant to stay unused.
        self.prerun_cmds += [step_cmd for _ in range(self.num_tasks - 1)]

    @run_before('run')
    def set_launcher(self):
        """Run the executable (``wait``) directly in the job script."""
        self.job.launcher = getlauncher('local')()

    @sanity_function
    def check_cpu_gpu_numa_bind(self):
        """Assert that every reported CPU CCD equals the GPU's CCD.

        Both patterns capture the CCD number in the named group ``number``,
        which is the tag passed to ``sn.extractall``. The check also
        requires at least one match, so that missing output is reported as
        a failure instead of two empty lists comparing equal.
        """
        cpu_bind = sn.extractall(
            r'\(CCD(?P<number>\S+)\)', self.stdout, 'number', int
        )
        gpu_bind = sn.extractall(
            r'\(GCD\S+\/CCD(?P<number>\S+)\)', self.stdout, 'number', int
        )
        return sn.all([
            sn.assert_true(cpu_bind, msg='no CPU binding info found'),
            sn.assert_eq(cpu_bind, gpu_bind),
        ])