From cc61ed36818e376eb29d2ea30104996173a994fa Mon Sep 17 00:00:00 2001
From: Claudia Watson
Date: Wed, 22 Oct 2025 15:52:43 +0100
Subject: [PATCH 1/3] Adds bandwidth.yml playbook to download, build, and run
 nvbandwidth. bandwidth.yml is run via cudatests.yml

---
 ansible/adhoc/cudatests.yml            |  5 +++
 ansible/roles/cuda/defaults/main.yml   |  3 ++
 ansible/roles/cuda/tasks/bandwidth.yml | 56 ++++++++++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 ansible/roles/cuda/tasks/bandwidth.yml

diff --git a/ansible/adhoc/cudatests.yml b/ansible/adhoc/cudatests.yml
index f571f8a89..8c325158a 100644
--- a/ansible/adhoc/cudatests.yml
+++ b/ansible/adhoc/cudatests.yml
@@ -7,3 +7,8 @@
     - ansible.builtin.import_role:
         name: cuda
         tasks_from: samples.yml
+
+    - name: Run CUDA bandwidth tasks
+      ansible.builtin.import_role:
+        name: cuda
+        tasks_from: bandwidth.yml
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml
index 692301d23..14d3d90f7 100644
--- a/ansible/roles/cuda/defaults/main.yml
+++ b/ansible/roles/cuda/defaults/main.yml
@@ -16,3 +16,6 @@ cuda_samples_programs:
   - bandwidthTest
 # cuda_devices: # discovered from deviceQuery run
 cuda_persistenced_state: started
+# variables for nvbandwidth (for bandwidth.yml tasks run in cudatests.yml)
+cuda_bandwidth_path: "/var/lib/{{ ansible_user }}/cuda_bandwidth"
+cuda_bandwidth_release_url: "https://github.com/NVIDIA/nvbandwidth/archive/refs/tags/v0.8.tar.gz"
diff --git a/ansible/roles/cuda/tasks/bandwidth.yml b/ansible/roles/cuda/tasks/bandwidth.yml
new file mode 100644
index 000000000..0d18088f6
--- /dev/null
+++ b/ansible/roles/cuda/tasks/bandwidth.yml
@@ -0,0 +1,56 @@
+---
+- name: Ensure cuda_bandwidth_path exists
+  ansible.builtin.file:
+    state: directory
+    path: "{{ cuda_bandwidth_path }}"
+    owner: "{{ ansible_user }}"
+    group: "{{ ansible_user }}"
+    mode: "0755"
+
+- name: Download CUDA bandwidth test release
+  ansible.builtin.unarchive:
+    remote_src: true
+    src: "{{ cuda_bandwidth_release_url }}"
+    dest: "{{ cuda_bandwidth_path }}"
+    owner: "{{ ansible_user }}"
+    group: "{{ ansible_user }}"
+    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8"
+
+- name: Creates CUDA bandwidth test build directory
+  ansible.builtin.file:
+    state: directory
+    path: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build"
+    mode: "0755"
+
+- name: Ensure cudatests directory exists
+  ansible.builtin.file:
+    path: "{{ appliances_environment_root }}/cudatests"
+    state: directory
+    mode: '0755'
+  delegate_to: localhost
+
+- name: Build CUDA bandwidth test
+  ansible.builtin.shell:
+    cmd: |
+      source /cvmfs/software.eessi.io/versions/2023.06/init/bash &&
+      module load Boost/1.82.0-GCC-12.3.0 &&
+      . /etc/profile.d/sh.local && cmake .. &&
+      make -j {{ ansible_processor_vcpus }}
+    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build"
+    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build/nvbandwidth"
+
+- name: Run CUDA bandwidth test
+  ansible.builtin.shell: |
+    export LD_LIBRARY_PATH=/cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen4/software/GCCcore/12.3.0/lib64:\
+    /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen4/software/Boost/1.82.0-GCC-12.3.0/lib
+    ./nvbandwidth
+  args:
+    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build/"
+  register: cuda_bandwidth_output
+
+- name: Save CUDA bandwidth output to bandwidth_results.txt
+  ansible.builtin.copy:
+    content: "{{ cuda_bandwidth_output.stdout }}"
+    dest: "{{ appliances_environment_root }}/cudatests/bandwidth_results.txt"
+    mode: '0644'
+  delegate_to: localhost

From 0692a33a96d959247bbdeed8942132ef000289ea Mon Sep 17 00:00:00 2001
From: Claudia Watson
Date: Tue, 28 Oct 2025 13:05:26 +0000
Subject: [PATCH 2/3] Adds bandwidth.yml playbook for NVIDIA nvbandwidth and
 removes samples.yml tasks from adhoc/cudatests.yml

---
 ansible/adhoc/cudatests.yml            |  6 +-----
 ansible/roles/cuda/defaults/main.yml   |  4 +++-
 ansible/roles/cuda/tasks/bandwidth.yml | 23 ++++++++++++-----------
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/ansible/adhoc/cudatests.yml b/ansible/adhoc/cudatests.yml
index 8c325158a..0f5cf78a0 100644
--- a/ansible/adhoc/cudatests.yml
+++ b/ansible/adhoc/cudatests.yml
@@ -2,12 +2,8 @@
 - hosts: cuda
   become: true
   gather_facts: true
-  tags: cuda_samples
+  tags: cuda_bandwidth
   tasks:
-    - ansible.builtin.import_role:
-        name: cuda
-        tasks_from: samples.yml
-
     - name: Run CUDA bandwidth tasks
       ansible.builtin.import_role:
         name: cuda
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml
index 14d3d90f7..a1a55deb8 100644
--- a/ansible/roles/cuda/defaults/main.yml
+++ b/ansible/roles/cuda/defaults/main.yml
@@ -16,6 +16,8 @@ cuda_samples_programs:
   - bandwidthTest
 # cuda_devices: # discovered from deviceQuery run
 cuda_persistenced_state: started
+cuda_install_nvidiafabricmanger: false
 # variables for nvbandwidth (for bandwidth.yml tasks run in cudatests.yml)
+cuda_bandwidth_version: '0.8'
 cuda_bandwidth_path: "/var/lib/{{ ansible_user }}/cuda_bandwidth"
-cuda_bandwidth_release_url: "https://github.com/NVIDIA/nvbandwidth/archive/refs/tags/v0.8.tar.gz"
+cuda_bandwidth_release_url: "https://github.com/NVIDIA/nvbandwidth/archive/refs/tags/v{{ cuda_bandwidth_version }}.tar.gz"
diff --git a/ansible/roles/cuda/tasks/bandwidth.yml b/ansible/roles/cuda/tasks/bandwidth.yml
index 0d18088f6..58f57dd73 100644
--- a/ansible/roles/cuda/tasks/bandwidth.yml
+++ b/ansible/roles/cuda/tasks/bandwidth.yml
@@ -1,5 +1,5 @@
 ---
-- name: Ensure cuda_bandwidth_path exists
+- name: Ensure CUDA bandwidth path exists
   ansible.builtin.file:
     state: directory
     path: "{{ cuda_bandwidth_path }}"
@@ -14,12 +14,12 @@
     dest: "{{ cuda_bandwidth_path }}"
     owner: "{{ ansible_user }}"
     group: "{{ ansible_user }}"
-    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8"
+    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-{{ cuda_bandwidth_version }}"
 
 - name: Creates CUDA bandwidth test build directory
   ansible.builtin.file:
     state: directory
-    path: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build"
+    path: "{{ cuda_bandwidth_path }}/nvbandwidth-{{ cuda_bandwidth_version }}/build"
     mode: "0755"
 
 - name: Ensure cudatests directory exists
@@ -31,26 +31,27 @@
 
 - name: Build CUDA bandwidth test
   ansible.builtin.shell:
-    cmd: |
+    cmd: >
       source /cvmfs/software.eessi.io/versions/2023.06/init/bash &&
+      module load buildenv/default-foss-2023b &&
       module load Boost/1.82.0-GCC-12.3.0 &&
-      . /etc/profile.d/sh.local && cmake .. &&
+      . /etc/profile.d/sh.local &&
+      cmake .. &&
       make -j {{ ansible_processor_vcpus }}
-    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build"
-    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build/nvbandwidth"
+    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-{{ cuda_bandwidth_version }}/build"
+    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-{{ cuda_bandwidth_version }}/build/nvbandwidth"
 
 - name: Run CUDA bandwidth test
   ansible.builtin.shell: |
-    export LD_LIBRARY_PATH=/cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen4/software/GCCcore/12.3.0/lib64:\
-    /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen4/software/Boost/1.82.0-GCC-12.3.0/lib
     ./nvbandwidth
   args:
-    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build/"
+    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-{{ cuda_bandwidth_version }}/build/"
   register: cuda_bandwidth_output
+  changed_when: true
 
 - name: Save CUDA bandwidth output to bandwidth_results.txt
   ansible.builtin.copy:
     content: "{{ cuda_bandwidth_output.stdout }}"
-    dest: "{{ appliances_environment_root }}/cudatests/bandwidth_results.txt"
+    dest: "{{ appliances_environment_root }}/cudatests/{{ inventory_hostname }}bandwidth_results.txt"
     mode: '0644'
   delegate_to: localhost

From 5160952b87cadb8f25eed6ef6ea10457bb7daea8 Mon Sep 17 00:00:00 2001
From: Claudia Watson
Date: Tue, 4 Nov 2025 09:35:39 +0000
Subject: [PATCH 3/3] changing output file name

---
 ansible/roles/cuda/tasks/bandwidth.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ansible/roles/cuda/tasks/bandwidth.yml b/ansible/roles/cuda/tasks/bandwidth.yml
index 58f57dd73..6711f88ac 100644
--- a/ansible/roles/cuda/tasks/bandwidth.yml
+++ b/ansible/roles/cuda/tasks/bandwidth.yml
@@ -52,6 +52,6 @@
-- name: Save CUDA bandwidth output to bandwidth_results.txt
+- name: Save CUDA bandwidth output to per-host results file
   ansible.builtin.copy:
     content: "{{ cuda_bandwidth_output.stdout }}"
-    dest: "{{ appliances_environment_root }}/cudatests/{{ inventory_hostname }}bandwidth_results.txt"
+    dest: "{{ appliances_environment_root }}/cudatests/nvbandwidth-{{ inventory_hostname }}.txt"
     mode: '0644'
   delegate_to: localhost