Gpu improvements #66

Closed · wants to merge 30 commits

Commits (30)
0df3f2a  improve handling of gpu options (jedwards4b, Nov 1, 2022)
a13d803  only provide gpu flags to specific files (jedwards4b, Nov 3, 2022)
09a6835  generalize these flags, they should not be casper specific, add openm… (jedwards4b, Nov 3, 2022)
ff2b2fe  correct typo, update cuda module (jedwards4b, Nov 3, 2022)
1d7974a  more cleanup of gpu functionality (jedwards4b, Nov 3, 2022)
fbc05d6  remove -gpu stuff (jedwards4b, Nov 7, 2022)
0a4702a  Merge remote-tracking branch 'origin/gust_update022423' into gpu_impr… (jedwards4b, Mar 15, 2023)
9beb292  Add Jim's changes for Gust's modules from https://github.com/jedwards… (sjsprecious, Mar 22, 2023)
4134af3  Add Jim's gpu_improvement branch at https://github.com/jedwards4b/ccs… (sjsprecious, Mar 22, 2023)
3b7a182  Add max_cputasks_per_gpu_node, gpu_type and gpu_offload options for C… (sjsprecious, Mar 23, 2023)
ee726b5  update batch script template for Gust GPU node (sjsprecious, Mar 23, 2023)
1f5eb30  typo fix for mismatched tags for GPU options (sjsprecious, Mar 23, 2023)
8649a95  Add missing "MAX_GPUS_PER_NODE" (sjsprecious, Mar 24, 2023)
e7bfbe1  downgrade netcdf-mpi for nvhpc/22.11 (sjsprecious, Mar 25, 2023)
bf14967  bug fix of batch script template with gpu_enabled option (sjsprecious, Mar 25, 2023)
14f605e  remove -target=zen3 flag on gust (sjsprecious, Mar 25, 2023)
59086af  update to nvhpc/23.1 and related modules (sjsprecious, Mar 26, 2023)
918e3a8  turn off MPS on Gust (sjsprecious, Mar 26, 2023)
85045b9  updates from jian (jedwards4b, Mar 27, 2023)
f2b2fda  update to ncarenv/23.03 (jedwards4b, Mar 31, 2023)
d0093a5  Update module versions for intel compiler (sjsprecious, Apr 11, 2023)
4023d67  Merge branch 'main' into add_gpu_gust (sjsprecious, Apr 17, 2023)
4e375b2  update esmf version on gust (jedwards4b, Apr 18, 2023)
7224a37  load ncarenv and ncarcompilers manually (sjsprecious, Apr 18, 2023)
822f7b7  update the ncarenv and cesmdev module on Gust (sjsprecious, Apr 25, 2023)
21fc13a  merge to ccs_config_cesm0.0.71 (jedwards4b, May 25, 2023)
aaec12b  fix unresolved merge issue (jedwards4b, May 25, 2023)
df29563  update default wallclock time (jedwards4b, May 26, 2023)
823653d  merge jian/add_gpu_gust (jedwards4b, May 26, 2023)
a002bdc  add GPU_OFFLOAD (jedwards4b, May 26, 2023)
3 changes: 1 addition & 2 deletions machines/cmake_macros/gust.cmake
@@ -5,7 +5,6 @@ set(NETCDF_PATH "$ENV{NETCDF}")
set(PIO_FILESYSTEM_HINTS "lustre")
set(PNETCDF_PATH "$ENV{PNETCDF}")
# If we want to use cray-libsci instead of mkl uncomment this line as well as the module in config_machines.xml
-string(REPLACE "-mkl=cluster" "" SLIBS "${SLIBS}")
-#string(REPLACE "-mkl=cluster" "-qmkl=cluster" SLIBS "${SLIBS}")
+string(REPLACE "-mkl=cluster" "-qmkl=cluster" SLIBS "${SLIBS}")
#string(APPEND CPPDEFS " -DNO_SHR_VMATH ")
string(APPEND CPPDEFS " -DHAVE_GETTID")
46 changes: 0 additions & 46 deletions machines/cmake_macros/nvhpc-gpu.cmake

This file was deleted.

15 changes: 0 additions & 15 deletions machines/cmake_macros/nvhpc-gpu_casper.cmake

This file was deleted.

19 changes: 19 additions & 0 deletions machines/cmake_macros/nvhpc.cmake
@@ -49,3 +49,22 @@ set(MPIFC "mpif90")
set(SCC "nvc")
set(SCXX "nvc++")
set(SFC "nvfortran")
if (GPU_TYPE STREQUAL v100 AND GPU_OFFLOAD STREQUAL openacc)
string(APPEND GPUFLAGS " -acc -gpu=cc70,lineinfo,nofma -Minfo=accel ")
endif()
if (GPU_TYPE STREQUAL v100 AND GPU_OFFLOAD STREQUAL openmp)
string(APPEND GPUFLAGS " -mp=gpu -gpu=cc70,lineinfo,nofma -Minfo=accel ")
endif()
if (GPU_TYPE STREQUAL v100 AND GPU_OFFLOAD STREQUAL combined)
string(APPEND GPUFLAGS " -acc -gpu=cc70,lineinfo,nofma -mp=gpu -Minfo=accel ")
endif()

if (GPU_TYPE STREQUAL a100 AND GPU_OFFLOAD STREQUAL openacc)
string(APPEND GPUFLAGS " -acc -gpu=cc80,lineinfo,nofma -Minfo=accel ")
endif()
if (GPU_TYPE STREQUAL a100 AND GPU_OFFLOAD STREQUAL openmp)
string(APPEND GPUFLAGS " -mp=gpu -gpu=cc80,lineinfo,nofma -Minfo=accel ")
endif()
if (GPU_TYPE STREQUAL a100 AND GPU_OFFLOAD STREQUAL combined)
string(APPEND GPUFLAGS " -acc -gpu=cc80,lineinfo,nofma -mp=gpu -Minfo=accel")
endif()
1 change: 1 addition & 0 deletions machines/cmake_macros/nvhpc_casper.cmake
@@ -13,3 +13,4 @@ if (MPILIB STREQUAL mpi-serial)
string(APPEND SLIBS " -ldl")
endif()
string(APPEND SLIBS " -L${NETCDF_PATH}/lib -lnetcdf -lnetcdff")
message("GPU_TYPE is ${GPU_TYPE} GPU_OFFLOAD is ${GPU_OFFLOAD}")
46 changes: 0 additions & 46 deletions machines/cmake_macros/pgi-gpu.cmake

This file was deleted.

15 changes: 0 additions & 15 deletions machines/cmake_macros/pgi-gpu_casper.cmake

This file was deleted.

45 changes: 17 additions & 28 deletions machines/config_batch.xml
@@ -226,34 +226,14 @@
<!-- casper pbs -->
<batch_system MACH="casper" type="pbs">
<batch_submit>qsub</batch_submit>

<directives queue="casper" compiler="pgi-gpu">
<!-- Turn on MPS server manually -->
<!-- This is a temporary solution and should be removed once MPS is integrated into PBS on Casper -->
<directive default="/bin/bash" > -S /glade/u/apps/dav/opt/nvidia-mps/mps_bash </directive>
<directive> -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }} </directive>
<directive> -l gpu_type=v100 </directive>
</directives>

<directives queue="casper" compiler="nvhpc-gpu">
<!-- Turn on MPS server manually -->
<!-- This is a temporary solution and should be removed once MPS is integrated into PBS on Casper -->
<directive default="/bin/bash" > -S /glade/u/apps/dav/opt/nvidia-mps/mps_bash </directive>
<directive> -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }} </directive>
<directive> -l gpu_type=v100 </directive>
</directives>

<directives queue="casper" compiler="nvhpc">
<directive default="/bin/bash" > -S {{ shell }} </directive>
<directive> -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }} </directive>
</directives>

<directives queue="casper" compiler="pgi">
<submit_args>
<argument> -l gpu_type=$GPU_TYPE </argument>
</submit_args>
<directives queue="casper" compiler="nvhpc" gpu_enabled="true">
Collaborator:
I know this works, but I am not sure how gpu_enabled actually works. Is it an XML variable defined somewhere, and how is it set to true or false during the build? A brief explanation would be very helpful.

Collaborator (Author):
This is done here: https://github.com/jedwards4b/cime/blob/add_gpu_gust/CIME/case/case.py#L457

gpu_enabled is an attribute of the case object and is set to true if GPU_TYPE is set to a valid value.
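For illustration, a minimal Python sketch of the pattern described above; the class, attribute names, and values here are hypothetical and are not the actual CIME implementation:

# Hypothetical sketch only -- see case.py in CIME for the real implementation.
class Case:
    def __init__(self, gpu_type=None):
        # GPU_TYPE as it would come from the case XML; None or "none" means a CPU-only case.
        self.gpu_type = gpu_type
        # gpu_enabled is derived from GPU_TYPE and is what the batch templates
        # match against in the gpu_enabled="true"/"false" <directives> blocks.
        self.gpu_enabled = gpu_type is not None and gpu_type.lower() != "none"

cpu_case = Case()                 # gpu_enabled is False -> gpu_enabled="false" directives
gpu_case = Case(gpu_type="a100")  # gpu_enabled is True  -> gpu_enabled="true" directives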

Collaborator:
Thanks @jedwards4b for the details. That is very helpful!

<directive default="/bin/bash" > -S {{ shell }} </directive>
<directive> -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }} </directive>
<directive> -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }}:mps=1 </directive>
</directives>

<directives queue="casper" compiler="intel">
<directives queue="casper" gpu_enabled="false">
<directive default="/bin/bash" > -S {{ shell }} </directive>
<directive> -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }} </directive>
</directives>
@@ -490,12 +470,21 @@

<batch_system MACH="gust" type="pbs" >
<batch_submit>qsub</batch_submit>
<directives>
<submit_args>
<argument> -l gpu_type=$GPU_TYPE </argument>
</submit_args>
<directives queue="main" gpu_enabled="false">
<directive default="/bin/bash" > -S {{ shell }} </directive>
<directive> -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}</directive>
</directives>
<directives queue="main" gpu_enabled="true">
<directive default="/bin/bash" > -S {{ shell }} </directive>
<directive> -l select={{ num_nodes }}:ncpus={{ max_cputasks_per_gpu_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=430GB:ngpus={{ ngpus_per_node }} </directive>
<!-- Do not use MPS on Gust yet as it conflicts with cray-mpich library -->
<!-- <directive> -l select={{ num_nodes }}:ncpus={{ max_cputasks_per_gpu_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=430GB:ngpus={{ ngpus_per_node }}:mps=1 </directive> -->
</directives>
<queues>
<queue walltimemax="4:00:00" nodemin="1" nodemax="8" >main</queue>
<queue walltimemax="2:00:00" nodemin="1" nodemax="8" >main</queue>
<queue walltimemax="1:00:00" nodemin="9" nodemax="16" >bigcpu</queue>
</queues>
</batch_system>
76 changes: 22 additions & 54 deletions machines/config_machines.xml
@@ -410,19 +410,22 @@ This allows using a different mpirun command to launch unit tests
<DESC>NCAR GPU platform, os is Linux, 36 pes/node, batch system is pbs</DESC>
<NODENAME_REGEX>casper*</NODENAME_REGEX>
<OS>LINUX</OS>
<COMPILERS>pgi,intel,nvhpc,pgi-gpu,nvhpc-gpu</COMPILERS>
<COMPILERS>nvhpc,intel</COMPILERS>
<MPILIBS>openmpi</MPILIBS>
<CIME_OUTPUT_ROOT>/glade/scratch/$USER</CIME_OUTPUT_ROOT>
<DIN_LOC_ROOT>$ENV{CESMDATAROOT}/inputdata</DIN_LOC_ROOT>
<DIN_LOC_ROOT_CLMFORC>/glade/p/cgd/tss/CTSM_datm_forcing_data</DIN_LOC_ROOT_CLMFORC>
<DOUT_S_ROOT>$CIME_OUTPUT_ROOT/archive/$CASE</DOUT_S_ROOT>
<BASELINE_ROOT>$ENV{CESMDATAROOT}/cesm_baselines</BASELINE_ROOT>
<CCSM_CPRNC>$ENV{CESMDATAROOT}/tools/cime/tools/cprnc/cprnc</CCSM_CPRNC>
Collaborator:
remove pgi and *-gpu at line 413.

Collaborator (Author):
I know I haven't completed the cleanup here yet.

<GMAKE_J>8</GMAKE_J>
<BATCH_SYSTEM>pbs</BATCH_SYSTEM>
<SUPPORTED_BY>ASAP/CISL</SUPPORTED_BY>
<MAX_TASKS_PER_NODE>36</MAX_TASKS_PER_NODE>
<MAX_GPUS_PER_NODE>8</MAX_GPUS_PER_NODE>
<MAX_GPUS_PER_NODE compiler="nvhpc">8</MAX_GPUS_PER_NODE>
<MAX_MPITASKS_PER_NODE>36</MAX_MPITASKS_PER_NODE>
<MAX_CPUTASKS_PER_GPU_NODE>36</MAX_CPUTASKS_PER_GPU_NODE>
<GPU_TYPES>v100,a100</GPU_TYPES>
Collaborator:
Shall we specify a gpu_offload variable as well? For machines with AMD GPUs, openacc and combined may not be valid inputs.

Collaborator (Author):
How do we handle Fortran do concurrent here?

Collaborator:
This is a great question! In this case, combined may be a confusing word, too. Is there any plan to introduce do concurrent to CAM in the future?
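As a rough illustration of the per-machine restriction being discussed here, a hypothetical Python sketch follows; the machine names, supported sets, and the validation function are examples only, not existing CIME code:

# Hypothetical sketch of per-machine GPU option validation; values are examples only.
SUPPORTED_GPU_OPTIONS = {
    # machine: (allowed GPU_TYPES, allowed GPU_OFFLOAD modes)
    "gust":   ({"a100"},         {"openacc", "openmp", "combined"}),
    "casper": ({"v100", "a100"}, {"openacc", "openmp", "combined"}),
}

def validate_gpu_settings(machine, gpu_type, gpu_offload):
    """Reject GPU_TYPE/GPU_OFFLOAD values that a machine does not declare."""
    gpu_types, offload_modes = SUPPORTED_GPU_OPTIONS[machine]
    if gpu_type not in gpu_types:
        raise ValueError(f"{machine}: unsupported GPU_TYPE {gpu_type!r}")
    if gpu_offload not in offload_modes:
        raise ValueError(f"{machine}: unsupported GPU_OFFLOAD {gpu_offload!r}")

validate_gpu_settings("gust", "a100", "openmp")   # passes with these example values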

<PROJECT_REQUIRED>TRUE</PROJECT_REQUIRED>
<mpirun mpilib="default">
<executable>mpirun</executable>
@@ -450,54 +453,22 @@ This allows using a different mpirun command to launch unit tests
<command name="load">ncarenv/1.3</command>
<command name="load">cmake/3.18.2</command>
</modules>
<modules compiler="pgi">
<command name="load">pgi/20.4</command>
</modules>
<modules compiler="pgi-gpu">
<command name="load">pgi/20.4</command>
</modules>
<modules compiler="nvhpc">
<command name="load">nvhpc/22.2</command>
</modules>
<modules compiler="nvhpc-gpu">
<command name="load">nvhpc/22.2</command>
</modules>
<modules compiler="intel">
<command name="load">intel/19.1.1</command>
<command name="load">mkl/2020.0.1</command>
</modules>
<modules mpilib="openmpi" compiler="pgi">
<command name="load">openmpi/4.1.0</command>
<command name="load">netcdf-mpi/4.8.0</command>
<command name="load">pnetcdf/1.12.2</command>
</modules>
<modules mpilib="mpi-serial" compiler="pgi">
<command name="load">netcdf/4.8.0</command>
</modules>
<modules mpilib="openmpi" compiler="pgi-gpu">
<command name="load">openmpi/4.1.0</command>
<command name="load">netcdf-mpi/4.7.4</command>
<command name="load">pnetcdf/1.12.2</command>
<command name="load">cuda/11.0.3</command>
</modules>
<modules mpilib="mpi-serial" compiler="pgi-gpu">
<command name="load">netcdf/4.7.4</command>
</modules>
<modules mpilib="openmpi" compiler="nvhpc">
<command name="load">openmpi/4.1.4</command>
<command name="load">netcdf-mpi/4.8.1</command>
<command name="load">pnetcdf/1.12.3</command>
</modules>
<modules mpilib="mpi-serial" compiler="nvhpc">
<command name="load">netcdf/4.8.1</command>
</modules>
<modules mpilib="openmpi" compiler="nvhpc-gpu">
<command name="load">openmpi/4.1.4</command>
<command name="load">netcdf-mpi/4.8.1</command>
<command name="load">pnetcdf/1.12.3</command>
<command name="load">cuda/11.4.0</command>
<modules gpu_type="!none">
<command name="load">cuda/11.6</command>
</modules>
<modules mpilib="mpi-serial" compiler="nvhpc-gpu">
<modules mpilib="mpi-serial" compiler="nvhpc">
<command name="load">netcdf/4.8.1</command>
</modules>
<modules mpilib="openmpi" compiler="intel">
@@ -517,29 +488,21 @@ This allows using a different mpirun command to launch unit tests
<command name="use">/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/</command>
<command name="load">esmf-8.4.0b08_casper-ncdfio-openmpi-O</command>
</modules>
<modules compiler="nvhpc-gpu" mpilib="openmpi" DEBUG="TRUE">
<modules compiler="nvhpc" mpilib="openmpi" DEBUG="TRUE">
<command name="use">/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/nvhpc/22.2/</command>
<command name="load">esmf-8.4.1b01-ncdfio-openmpi-g</command>
<command name="load">esmf-8.4.1_casper-ncdfio-openmpi-g</command>
</modules>
<modules compiler="nvhpc-gpu" mpilib="openmpi" DEBUG="FALSE">
<modules compiler="nvhpc" mpilib="openmpi" DEBUG="FALSE">
<command name="use">/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/nvhpc/22.2/</command>
<command name="load">esmf-8.4.1b01-ncdfio-openmpi-O</command>
</modules>
<modules compiler="pgi" mpilib="openmpi" DEBUG="TRUE">
<command name="use">/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/</command>
<command name="load">esmf-8.4.0b08_casper-ncdfio-openmpi-g</command>
</modules>
<modules compiler="pgi" mpilib="openmpi" DEBUG="FALSE">
<command name="use">/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/</command>
<command name="load">esmf-8.2.0b11_casper-ncdfio-openmpi-O</command>
<command name="load">esmf-8.4.1_casper-ncdfio-openmpi-O</command>
</modules>
<modules>
<command name="load">ncarcompilers/0.5.0</command>
</modules>
<modules compiler="!pgi" DEBUG="FALSE" mpilib="openmpi">
<modules DEBUG="FALSE" mpilib="openmpi">
<command name="load">pio/2.5.10</command>
</modules>
<modules compiler="!pgi" DEBUG="TRUE" mpilib="openmpi">
<modules DEBUG="TRUE" mpilib="openmpi">
<command name="load">pio/2.5.10d</command>
</modules>
</module_system>
@@ -580,7 +543,7 @@ This allows using a different mpirun command to launch unit tests
<DIN_LOC_ROOT_CLMFORC>/glade/p/cgd/tss/CTSM_datm_forcing_data</DIN_LOC_ROOT_CLMFORC>
<DOUT_S_ROOT>$CIME_OUTPUT_ROOT/archive/$CASE</DOUT_S_ROOT>
<BASELINE_ROOT>$ENV{CESMDATAROOT}/cesm_baselines</BASELINE_ROOT>
<CCSM_CPRNC>$ENV{CESMDATAROOT}/tools/cime/tools/cprnc/cprnc.cheyenne</CCSM_CPRNC>
<CCSM_CPRNC>$ENV{CESMDATAROOT}/tools/cime/tools/cprnc/cprnc</CCSM_CPRNC>
<GMAKE_J>8</GMAKE_J>
<BATCH_SYSTEM>pbs</BATCH_SYSTEM>
<SUPPORTED_BY>cseg</SUPPORTED_BY>
@@ -1850,7 +1813,11 @@ This allows using a different mpirun command to launch unit tests
<BATCH_SYSTEM>pbs</BATCH_SYSTEM>
<SUPPORTED_BY>cseg</SUPPORTED_BY>
<MAX_TASKS_PER_NODE>128</MAX_TASKS_PER_NODE>
<MAX_GPUS_PER_NODE>4</MAX_GPUS_PER_NODE>
<MAX_MPITASKS_PER_NODE>128</MAX_MPITASKS_PER_NODE>
<MAX_CPUTASKS_PER_GPU_NODE>64</MAX_CPUTASKS_PER_GPU_NODE>
<GPU_TYPES>a100</GPU_TYPES>
Collaborator:
The same question about the gpu_offload variable.

<GPU_OFFLOAD>openacc,openmp,combined</GPU_OFFLOAD>
<PROJECT_REQUIRED>TRUE</PROJECT_REQUIRED>
<mpirun mpilib="default">
<executable>mpiexec</executable>
Collaborator:
At line 1838, shall we do a purge first before loading any modules?

Collaborator (Author):
This is a feature of the modules on Gust (and soon Derecho): the two modules loaded above the purge are sticky and are not affected by the purge command.

Collaborator:
Thanks for the clarification. That is clear to me now.

@@ -1907,11 +1874,12 @@ This allows using a different mpirun command to launch unit tests
<modules mpilib="mpi-serial">
<command name="load">mpi-serial/2.3.0</command>
</modules>

<modules gpu_type="!none">
<command name="load">cuda/11.7.1</command>
</modules>
<modules mpilib="mpi-serial">
<command name="load">netcdf/4.9.1</command>
</modules>

<modules mpilib="!mpi-serial">
<command name="load">netcdf-mpi/4.9.1</command>
<command name="load">parallel-netcdf/1.12.3</command>
6 changes: 0 additions & 6 deletions machines/mpi_run_gpu.casper

This file was deleted.