Merge branch 'worleyph/cime/taskmap' into master (PR #2458)

Currently, MPI task-to-compute-node mapping information is output in two locations: once in CAM, where it is truncated after the first 256 MPI tasks, and once in CLM, where it is truncated after the first 100 MPI tasks, in both cases only for these two components. This is not useful in current production runs. The use of environment variables, such as MPICH_CPUMASK_DISPLAY on Cray systems, generates data that are unnecessarily verbose for our needs. Here a share routine is introduced that writes out one line per compute node. Each line contains the compute node name and the list of MPI tasks assigned to that node for a given communicator. This routine is then called in the driver, where it writes out the task-to-node mapping for the entire coupled model. Separate branches will then introduce this into the individual components, replacing the current logic in both CAM and CLM, for example. The share routine also optionally returns the number of compute nodes and the task-to-node mapping, which is needed in the internal CAM load balancing. With the call to the shr_taskmap_write routine in the driver, the mapping data generated by the system when setting the corresponding environment variable is redundant, so the variable is no longer set on the systems that currently set it. Fixes #2457. BFB. * origin/worleyph/cime/taskmap: Avoid empty env blocks; Remove unnecessary white space in task-to-node map output; Modify driver output format; Uncomment MV2_CPU_MAPPING definition for Anvil; Modify task map output format; Unset environment variables to output task-to-node mapping; Output MPI task to compute node mapping
ESMCI · Aug 7, 2018 · 1a752ab · 1a752ab
2 parents fccf2ec + a67734e
commit 1a752ab
Show file tree

Hide file tree

Showing 3 changed files with 353 additions and 13 deletions.
diff --git a/config/e3sm/machines/config_machines.xml b/config/e3sm/machines/config_machines.xml
@@ -183,7 +183,7 @@
   <environment_variables>
     <env name="MPICH_ENV_DISPLAY">1</env>
     <env name="MPICH_VERSION_DISPLAY">1</env>
-    <env name="MPICH_CPUMASK_DISPLAY">1</env>
+    <!--env name="MPICH_CPUMASK_DISPLAY">1</env-->
 
     <env name="OMP_STACKSIZE">64M</env>
     <env name="OMP_PROC_BIND">spread</env>
@@ -327,7 +327,7 @@
 
     <env name="MPICH_ENV_DISPLAY">1</env>
     <env name="MPICH_VERSION_DISPLAY">1</env>
-    <env name="MPICH_CPUMASK_DISPLAY">1</env>
+    <!--env name="MPICH_CPUMASK_DISPLAY">1</env-->
 
     <env name="OMP_STACKSIZE">128M</env>
     <env name="OMP_PROC_BIND">spread</env>
@@ -483,7 +483,7 @@
   <environment_variables>
     <env name="MPICH_ENV_DISPLAY">1</env>
     <env name="MPICH_VERSION_DISPLAY">1</env>
-    <env name="MPICH_CPUMASK_DISPLAY">1</env>
+    <!--env name="MPICH_CPUMASK_DISPLAY">1</env-->
 
     <env name="OMP_STACKSIZE">128M</env>
     <env name="OMP_PROC_BIND">spread</env>
@@ -1675,14 +1675,12 @@
     <environment_variables>
       <env name="MPICH_ENV_DISPLAY">1</env>
       <env name="MPICH_VERSION_DISPLAY">1</env>
+      <!--env name="MPICH_CPUMASK_DISPLAY">1</env-->
       <env name="MPAS_TOOL_DIR">/projects/ccsm/acme/tools/mpas</env>
       <env name="HDF5_DISABLE_VERSION_CHECK">2</env>
       <env name="labeling"> </env>
       <env name="SMP_VARS"> </env>
     </environment_variables>
-    <environment_variables SMP_PRESENT="TRUE">
-      <env name="MPICH_CPUMASK_DISPLAY">1</env>
-    </environment_variables>
     <environment_variables SMP_PRESENT="TRUE" compiler="intel">
       <env name="SMP_VARS">-e OMP_NUM_THREADS=$ENV{OMP_NUM_THREADS} -e OMP_STACKSIZE=128M -e KMP_AFFINITY=granularity=thread,scatter</env>
     </environment_variables>
@@ -2247,7 +2245,7 @@
         <env name="MPILIB">$MPILIB</env>
         <env name="MPICH_ENV_DISPLAY">1</env>
         <env name="MPICH_VERSION_DISPLAY">1</env>
-        <env name="MPICH_CPUMASK_DISPLAY">1</env>
+        <!--env name="MPICH_CPUMASK_DISPLAY">1</env-->
         <env name="MPSTKZ">128M</env>
         <env name="OMP_STACKSIZE">128M</env>
       </environment_variables>

diff --git a/src/drivers/mct/shr/seq_comm_mct.F90 b/src/drivers/mct/shr/seq_comm_mct.F90
@@ -16,12 +16,14 @@ module seq_comm_mct
 !!! the namelist).  ARE OTHER PROTECTIONS/CHECKS NEEDED???
 
 
-  use mct_mod     , only : mct_world_init, mct_world_clean, mct_die
-  use shr_sys_mod , only : shr_sys_abort, shr_sys_flush
-  use shr_mpi_mod , only : shr_mpi_chkerr, shr_mpi_bcast, shr_mpi_max
-  use shr_file_mod, only : shr_file_getUnit, shr_file_freeUnit
-  use esmf        , only : ESMF_LogKind_Flag, ESMF_LOGKIND_NONE
-  use esmf        , only : ESMF_LOGKIND_SINGLE, ESMF_LOGKIND_MULTI
+  use mct_mod        , only : mct_world_init, mct_world_clean, mct_die
+  use shr_sys_mod    , only : shr_sys_abort, shr_sys_flush
+  use shr_mpi_mod    , only : shr_mpi_chkerr, shr_mpi_bcast, shr_mpi_max
+  use shr_file_mod   , only : shr_file_getUnit, shr_file_freeUnit
+  use shr_taskmap_mod, only : shr_taskmap_write
+  use perf_mod       , only : t_startf, t_stopf
+  use esmf           , only : ESMF_LogKind_Flag, ESMF_LOGKIND_NONE
+  use esmf           , only : ESMF_LOGKIND_SINGLE, ESMF_LOGKIND_MULTI
 
   implicit none
 
@@ -222,6 +224,7 @@ subroutine seq_comm_init(global_comm_in, driver_comm_in, nmlfile, drv_comm_id)
     integer, pointer :: comps(:) ! array with component ids
     integer, pointer :: comms(:) ! array with mpicoms
     integer :: nu
+    character(len=8) :: c_global_numpes ! global number of pes
     character(len=seq_comm_namelen) :: valid_comps(ncomps)
 
     integer :: &
@@ -291,6 +294,17 @@ subroutine seq_comm_init(global_comm_in, driver_comm_in, nmlfile, drv_comm_id)
        call shr_sys_abort(trim(subname)//' ERROR decomposition error ')
     endif
 
+    ! output task-to-node mapping
+    if (mype == 0) then
+       write(c_global_numpes,'(i8)') global_numpes
+       write(logunit,100) trim(adjustl(c_global_numpes))
+100    format(/,a,' pes participating in computation of coupled model')
+       call shr_sys_flush(logunit)
+    endif
+    call t_startf("shr_taskmap_write")
+    call shr_taskmap_write(logunit, GLOBAL_COMM_IN, 'GLOBAL')
+    call t_stopf("shr_taskmap_write")
+
     ! Initialize gloiam on all IDs
 
     global_mype = mype