Skip to content

Commit

Permalink
Merge pull request #3791 from ESMCI/jgfouca/revert_mem-usage-logging
Browse files Browse the repository at this point in the history
Revert "Merge pull request #3788 from ESMCI/azamat/driver/mem-usage-logging"

This reverts commit 1922e71, reversing
changes made to 92e1294.

[ Description of the changes in this Pull Request. It should be enough
information for someone not following this development to understand.
Lines should be wrapped at about 72 characters. Please also update
the CIME documentation, if necessary, in doc/source/rst and indicate
below if you need to have the gh-pages html regenerated.]

Test suite:
Test baseline:
Test namelist changes:
Test status: [bit for bit, roundoff, climate changing]

Fixes [CIME Github issue #]

User interface changes?:

Update gh-pages html (Y/N)?:

Code review:
  • Loading branch information
jgfouca authored Dec 9, 2020
2 parents 21e60b6 + 2a1ea54 commit f9fcf34
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 250 deletions.
21 changes: 0 additions & 21 deletions driver-mct/cime_config/config_component_cesm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -69,27 +69,6 @@
(0: no output; 1: compact; 2: verbose).</desc>
</entry>

<!-- INFO_MPROF: integer run-time flag (group run_flags, stored in env_run.xml) that selects
     the memory-profile logging level. Allowed values are 0, 1, 2 and 3; the default is 2.
     Values 0 and 1 log per-task memory usage; values 2 and 3 log node-level aggregates. -->
<entry id="INFO_MPROF">
<type>integer</type>
<valid_values>0,1,2,3</valid_values>
<default_value>2</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>Sets level of memory profile logging:
0: log mem-usage from component ROOTPE tasks
1: log mem-usage from all tasks
2: aggregate logging to node-level mem-usage on ROOTPE nodes
3: aggregate logging to node-level mem-usage on all nodes</desc>
</entry>

<!-- INFO_MPROF_DT: integer run-time setting (group run_flags, stored in env_run.xml) giving
     the number of seconds between memory-profiling log records. The default, 86400, is one day.
     NOTE(review): no valid_values are declared for this entry, and the driver appears to use
     the corresponding info_mprof_dt as the divisor in mod(tod, info_mprof_dt); a value of 0
     looks unsafe. Confirm and consider constraining it to positive integers. -->
<entry id="INFO_MPROF_DT">
<type>integer</type>
<default_value>86400</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>number of seconds between memory profiling logs</desc>
</entry>

<entry id="SAVE_TIMING">
<type>logical</type>
<valid_values>TRUE,FALSE</valid_values>
Expand Down
21 changes: 0 additions & 21 deletions driver-mct/cime_config/config_component_e3sm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,27 +40,6 @@
(0: no output; 1: compact; 2: verbose).</desc>
</entry>

<!-- INFO_MPROF: integer run-time flag (group run_flags, stored in env_run.xml) that selects
     the memory-profile logging level. Allowed values are 0, 1, 2 and 3; the default is 2.
     Values 0 and 1 log per-task memory usage; values 2 and 3 log node-level aggregates. -->
<entry id="INFO_MPROF">
<type>integer</type>
<valid_values>0,1,2,3</valid_values>
<default_value>2</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>Sets level of memory profile logging:
0: log mem-usage from component ROOTPE tasks
1: log mem-usage from all tasks
2: aggregate logging to node-level mem-usage on ROOTPE nodes
3: aggregate logging to node-level mem-usage on all nodes</desc>
</entry>

<!-- INFO_MPROF_DT: integer run-time setting (group run_flags, stored in env_run.xml) giving
     the number of seconds between memory-profiling log records. The default, 86400, is one day.
     NOTE(review): no valid_values are declared for this entry, and the driver appears to use
     the corresponding info_mprof_dt as the divisor in mod(tod, info_mprof_dt); a value of 0
     looks unsafe. Confirm and consider constraining it to positive integers. -->
<entry id="INFO_MPROF_DT">
<type>integer</type>
<default_value>86400</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>number of seconds between memory profiling logs</desc>
</entry>

<entry id="SAVE_TIMING">
<type>logical</type>
<valid_values>TRUE,FALSE</valid_values>
Expand Down
28 changes: 0 additions & 28 deletions driver-mct/cime_config/namelist_definition_drv.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3084,34 +3084,6 @@
</values>
</entry>

<!-- info_mprof: integer namelist variable (category and group cime_pes) holding the
     memory-profile logging level. Its value is substituted from the $INFO_MPROF XML variable. -->
<entry id="info_mprof">
<type>integer</type>
<category>cime_pes</category>
<group>cime_pes</group>
<desc>
Sets level of memory profile logging:
0: log mem-usage from component ROOTPE tasks
1: log mem-usage from all tasks
2: aggregate logging to node-level mem-usage on ROOTPE nodes
3: aggregate logging to node-level mem-usage on all nodes
</desc>
<values>
<value>$INFO_MPROF</value>
</values>
</entry>

<!-- info_mprof_dt: integer namelist variable (category and group cime_pes) giving the number
     of seconds between memory-profiling log records. Its value is substituted from the
     $INFO_MPROF_DT XML variable.
     NOTE(review): the driver appears to use this as the divisor in mod(tod, info_mprof_dt),
     so it presumably must be non-zero. Confirm against the code that reads the namelist. -->
<entry id="info_mprof_dt">
<type>integer</type>
<category>cime_pes</category>
<group>cime_pes</group>
<desc>
number of seconds between memory profiling logs
</desc>
<values>
<value>$INFO_MPROF_DT</value>
</values>
</entry>

<!-- =========================== -->
<!-- group prof_inparm -->
<!-- in perf_mod.F90 -->
Expand Down
172 changes: 9 additions & 163 deletions driver-mct/main/cime_comp_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ module cime_comp_mod
!----------------------------------------------------------------------------

! mpi comm data & routines, plus logunit and loglevel
use seq_comm_mct, only: CPLID, GLOID, logunit, loglevel, info_taskmap_comp, info_mprof, info_mprof_dt
use seq_comm_mct, only: CPLID, GLOID, logunit, loglevel, info_taskmap_comp
use seq_comm_mct, only: ATMID, LNDID, OCNID, ICEID, GLCID, ROFID, WAVID, ESPID
use seq_comm_mct, only: ALLATMID,ALLLNDID,ALLOCNID,ALLICEID,ALLGLCID,ALLROFID,ALLWAVID,ALLESPID
use seq_comm_mct, only: CPLALLATMID,CPLALLLNDID,CPLALLOCNID,CPLALLICEID
Expand All @@ -77,9 +77,8 @@ module cime_comp_mod
use seq_comm_mct, only: num_inst_total, num_inst_max
use seq_comm_mct, only: seq_comm_iamin, seq_comm_name, seq_comm_namelen
use seq_comm_mct, only: seq_comm_init, seq_comm_setnthreads, seq_comm_getnthreads
use seq_comm_mct, only: seq_comm_getinfo => seq_comm_setptrs, seq_comm_gloroot
use seq_comm_mct, only: seq_comm_getinfo => seq_comm_setptrs
use seq_comm_mct, only: cpl_inst_tag
use seq_comm_mct, only: driver_nnodes, driver_task_node_map

! clock & alarm routines and variables
use seq_timemgr_mod, only: seq_timemgr_type
Expand Down Expand Up @@ -560,9 +559,6 @@ module cime_comp_mod
!----------------------------------------------------------------------------
real(r8) :: msize,msize0,msize1 ! memory size (high water)
real(r8) :: mrss ,mrss0 ,mrss1 ! resident size (current memory use)
real(r8),allocatable :: msizeOnTask(:),mrssOnTask(:) ! msize,mrss on each MPI task
real(r8),allocatable :: msizeOnNode(:),mrssOnNode(:) ! msize,mrss on each node
integer :: mlog

!----------------------------------------------------------------------------
! threading control
Expand Down Expand Up @@ -601,7 +597,6 @@ module cime_comp_mod
integer :: mpicom_CPLALLIACID ! MPI comm for CPLALLIACID

integer :: iam_GLOID ! pe number in global id
integer :: npes_GLOID ! global number of pes
logical :: iamin_CPLID ! pe associated with CPLID
logical :: iamroot_GLOID ! GLOID masterproc
logical :: iamroot_CPLID ! CPLID masterproc
Expand All @@ -615,8 +610,6 @@ module cime_comp_mod
logical :: iamin_CPLALLWAVID ! pe associated with CPLALLWAVID
logical :: iamin_CPLALLIACID ! pe associated with CPLALLIACID

integer :: atm_rootpe,lnd_rootpe,ice_rootpe,ocn_rootpe,&
glc_rootpe,rof_rootpe,wav_rootpe,iac_rootpe

!----------------------------------------------------------------------------
! complist: list of comps on this pe
Expand Down Expand Up @@ -724,7 +717,7 @@ subroutine cime_pre_init1(esmf_log_option)
end if

!--- set task based threading counts ---
call seq_comm_getinfo(GLOID,pethreads=pethreads_GLOID,iam=iam_GLOID,npes=npes_GLOID)
call seq_comm_getinfo(GLOID,pethreads=pethreads_GLOID,iam=iam_GLOID)
call seq_comm_setnthreads(pethreads_GLOID)

!--- get some general data ---
Expand All @@ -744,15 +737,6 @@ subroutine cime_pre_init1(esmf_log_option)
comp_iamin(it) = seq_comm_iamin(comp_id(it))
comp_name(it) = seq_comm_name(comp_id(it))

atm_rootpe = seq_comm_gloroot(ALLATMID)
lnd_rootpe = seq_comm_gloroot(ALLLNDID)
ice_rootpe = seq_comm_gloroot(ALLICEID)
ocn_rootpe = seq_comm_gloroot(ALLOCNID)
glc_rootpe = seq_comm_gloroot(ALLGLCID)
rof_rootpe = seq_comm_gloroot(ALLROFID)
wav_rootpe = seq_comm_gloroot(ALLWAVID)
iac_rootpe = seq_comm_gloroot(ALLIACID)

do eai = 1,num_inst_atm
it=it+1
comp_id(it) = ATMID(eai)
Expand Down Expand Up @@ -1519,13 +1503,6 @@ subroutine cime_init()
complist = trim(complist)//' '//trim(compname)
endif
enddo
do eri = 1,num_inst_rof
iamin_ID = component_get_iamin_compid(rof(eri))
if (iamin_ID) then
compname = component_get_name(rof(eri))
complist = trim(complist)//' '//trim(compname)
endif
enddo
do ewi = 1,num_inst_wav
iamin_ID = component_get_iamin_compid(wav(ewi))
if (iamin_ID) then
Expand Down Expand Up @@ -2419,9 +2396,6 @@ subroutine cime_run()
real(r8) :: tbnds1_offset ! Time offset for call to seq_hist_writeaux
logical :: lnd2glc_averaged_now ! Whether lnd2glc averages were taken this timestep
logical :: prep_glc_accum_avg_called ! Whether prep_glc_accum_avg has been called this timestep
integer :: i, nodeId
character(len=15) :: c_ymdtod
character(len=18) :: c_mprof_file

101 format( A, i10.8, i8, 12A, A, F8.2, A, F8.2 )
102 format( A, i10.8, i8, A, 8L3 )
Expand Down Expand Up @@ -2458,92 +2432,21 @@ subroutine cime_run()
call seq_timemgr_EClockGetData( EClock_d, curr_ymd=ymd, curr_tod=tod)
#ifndef CPL_BYPASS
! Report on memory usage
call shr_mem_getusage(msize,mrss)

allocate( msizeOnTask(0:npes_GLOID-1), mrssOnTask(0:npes_GLOID-1), stat=ierr)
if (ierr /= 0) call shr_sys_abort('cime_run: allocate msizeOnTask,mrssOnTask failed')
allocate( msizeOnNode(0:driver_nnodes-1), mrssOnNode(0:driver_nnodes-1), stat=ierr)
if (ierr /= 0) call shr_sys_abort('cime_run: allocate msizeOnNode,mrssOnNode failed')

! log from cpl_rootpe only, so gather from all tasks
msizeOnTask(:) = -1
mrssOnTask(:) = -1
call mpi_gather (msize, 1, mpi_real8, &
msizeOnTask, 1, mpi_real8, &
0, mpicom_GLOID, ierr)
call mpi_gather (mrss, 1, mpi_real8, &
mrssOnTask, 1, mpi_real8, &
0, mpicom_GLOID, ierr)

! aggregate task-level to node-level mem-usage
msizeOnNode(:) = 0
mrssOnNode(:) = 0
do i=0,npes_GLOID-1
nodeId = driver_task_node_map(i)
msizeOnNode(nodeId) = msizeOnNode(nodeId) + msizeOnTask(i)
mrssOnNode(nodeId) = mrssOnNode(nodeId) + mrssOnTask(i)
enddo

! (For now, just look at the first instance of each component)
if ( iamroot_CPLID .or. &
ocn(ens1)%iamroot_compid .or. &
atm(ens1)%iamroot_compid .or. &
lnd(ens1)%iamroot_compid .or. &
ice(ens1)%iamroot_compid .or. &
glc(ens1)%iamroot_compid .or. &
rof(ens1)%iamroot_compid .or. &
wav(ens1)%iamroot_compid .or. &
iac(ens1)%iamroot_compid) then
call shr_mem_getusage(msize,mrss,.true.)

write(logunit,105) ' memory_write: model date = ',ymd,tod, &
' memory = ',msize,' MB (highwater) ',mrss,' MB (usage)', &
' (pe=',iam_GLOID,' comps=',trim(complist)//')'
endif
! write memory highwater and usage to standalone file
if ( iamroot_CPLID) then
mlog = shr_file_getUnit()
! log-name: memory.{0,1,2,3}.{nsecs}.log
write(c_mprof_file,'(a7,i1,a1,i0,a4)') 'memory.',info_mprof,'.',info_mprof_dt,'.log'
inquire(file=trim(c_mprof_file),exist=exists)
if (exists) then
open(mlog, file=trim(c_mprof_file), status='old', position='append')
else
open(mlog, file=trim(c_mprof_file), status='new', position='append')
endif

! log memory highwater and usage
write(c_ymdtod,'(f14.5)') ymd+tod/86400.
if (info_mprof == 1) then ! log each task
!---YMMDD.HHMMSS,--1234.567,--1234.567, msize,mrss (in MB) for each task
write(mlog,'(a15,a,*(f10.3,:,","))') c_ymdtod,",",(msizeOnTask(i),mrssOnTask(i),i=0,npes_GLOID-1)
else if (info_mprof == 0) then ! log ROOTPE tasks only
write(mlog,'(a15,a,*(f10.3,:,","))') c_ymdtod,",", &
(/msizeOnTask(iam_GLOID), mrssOnTask(iam_GLOID), &
msizeOnTask(atm_rootpe),mrssOnTask(atm_rootpe),&
msizeOnTask(lnd_rootpe),mrssOnTask(lnd_rootpe),&
msizeOnTask(ice_rootpe),mrssOnTask(ice_rootpe),&
msizeOnTask(ocn_rootpe),mrssOnTask(ocn_rootpe),&
msizeOnTask(glc_rootpe),mrssOnTask(glc_rootpe),&
msizeOnTask(rof_rootpe),mrssOnTask(rof_rootpe),&
msizeOnTask(wav_rootpe),mrssOnTask(wav_rootpe),&
msizeOnTask(iac_rootpe),mrssOnTask(iac_rootpe)/)
else if (info_mprof == 3) then ! log each node
write(mlog,'(a15,a,*(f10.3,:,","))') c_ymdtod,",",(msizeOnNode(i),mrssOnNode(i),i=0,driver_nnodes-1)
else if (info_mprof == 2) then ! log ROOTPE nodes
write(mlog,'(a15,a,*(f10.3,:,","))') c_ymdtod,",", &
(/msizeOnNode(driver_task_node_map(iam_GLOID)), mrssOnNode(driver_task_node_map(iam_GLOID)), &
msizeOnNode(driver_task_node_map(atm_rootpe)),mrssOnNode(driver_task_node_map(atm_rootpe)),&
msizeOnNode(driver_task_node_map(lnd_rootpe)),mrssOnNode(driver_task_node_map(lnd_rootpe)),&
msizeOnNode(driver_task_node_map(ice_rootpe)),mrssOnNode(driver_task_node_map(ice_rootpe)),&
msizeOnNode(driver_task_node_map(ocn_rootpe)),mrssOnNode(driver_task_node_map(ocn_rootpe)),&
msizeOnNode(driver_task_node_map(glc_rootpe)),mrssOnNode(driver_task_node_map(glc_rootpe)),&
msizeOnNode(driver_task_node_map(rof_rootpe)),mrssOnNode(driver_task_node_map(rof_rootpe)),&
msizeOnNode(driver_task_node_map(wav_rootpe)),mrssOnNode(driver_task_node_map(wav_rootpe)),&
msizeOnNode(driver_task_node_map(iac_rootpe)),mrssOnNode(driver_task_node_map(iac_rootpe))/)
else
write(logunit,*) "cime_run: valid info_mprof values:0,1,10,11, given:",info_mprof
endif
endif
#endif
! Write out a timing file checkpoint
write(timing_file,'(a,i8.8,a1,i5.5)') &
Expand Down Expand Up @@ -3422,78 +3325,24 @@ subroutine cime_run()
endif
endif
#ifndef CPL_BYPASS
if (tod == 0 .or. info_debug > 1 .or. (mod(tod, info_mprof_dt) == 0)) then

if (tod == 0 .or. info_debug > 1) then
!! Report on memory usage
call shr_mem_getusage(msize,mrss)

call mpi_gather (msize, 1, mpi_real8, &
msizeOnTask, 1, mpi_real8, &
0, mpicom_GLOID, ierr)
call mpi_gather (mrss, 1, mpi_real8, &
mrssOnTask, 1, mpi_real8, &
0, mpicom_GLOID, ierr)

! aggregate task-level to node-level mem-usage
msizeOnNode(:) = 0
mrssOnNode(:) = 0
do i=0,npes_GLOID-1
nodeId = driver_task_node_map(i)
msizeOnNode(nodeId) = msizeOnNode(nodeId) + msizeOnTask(i)
mrssOnNode(nodeId) = mrssOnNode(nodeId) + mrssOnTask(i)
enddo

!! For now, just look at the first instance of each component
if ((tod == 0 .or. info_debug > 1) .and. &
(iamroot_CPLID .or. &
if ( iamroot_CPLID .or. &
ocn(ens1)%iamroot_compid .or. &
atm(ens1)%iamroot_compid .or. &
lnd(ens1)%iamroot_compid .or. &
ice(ens1)%iamroot_compid .or. &
glc(ens1)%iamroot_compid .or. &
wav(ens1)%iamroot_compid .or. &
rof(ens1)%iamroot_compid .or. &
iac(ens1)%iamroot_compid)) then
iac(ens1)%iamroot_compid) then
call shr_mem_getusage(msize,mrss,.true.)

write(logunit,105) ' memory_write: model date = ',ymd,tod, &
' memory = ',msize,' MB (highwater) ',mrss,' MB (usage)', &
' (pe=',iam_GLOID,' comps=',trim(complist)//')'
endif
if (iamroot_CPLID) then
! log memory highwater and usage
write(c_ymdtod,'(f14.5)') ymd+tod/86400.
if (info_mprof == 1) then ! log each task
!---YMMDD.HHMMSS,--1234.567,--1234.567, msize,mrss (in MB) for each task
write(mlog,'(a15,a,*(f10.3,:,","))') c_ymdtod,",",(msizeOnTask(i),mrssOnTask(i),i=0,npes_GLOID-1)
else if (info_mprof == 0) then ! ROOTPEs only
write(mlog,'(a15,a,*(f10.3,:,","))') c_ymdtod,",",&
(/msizeOnTask(iam_GLOID), mrssOnTask(iam_GLOID), &
msizeOnTask(atm_rootpe),mrssOnTask(atm_rootpe), &
msizeOnTask(lnd_rootpe),mrssOnTask(lnd_rootpe), &
msizeOnTask(ice_rootpe),mrssOnTask(ice_rootpe), &
msizeOnTask(ocn_rootpe),mrssOnTask(ocn_rootpe), &
msizeOnTask(glc_rootpe),mrssOnTask(glc_rootpe), &
msizeOnTask(rof_rootpe),mrssOnTask(rof_rootpe), &
msizeOnTask(wav_rootpe),mrssOnTask(wav_rootpe), &
msizeOnTask(iac_rootpe),mrssOnTask(iac_rootpe)/)
else if (info_mprof == 3) then ! log each node
write(mlog,'(a15,a,*(f10.3,:,","))') c_ymdtod,",",(msizeOnNode(i),mrssOnNode(i),i=0,driver_nnodes-1)
else if (info_mprof == 2) then ! log ROOTPE nodes
write(mlog,'(a15,a,*(f10.3,:,","))') c_ymdtod,",", &
(/msizeOnNode(driver_task_node_map(iam_GLOID)),mrssOnNode(driver_task_node_map(iam_GLOID)), &
msizeOnNode(driver_task_node_map(atm_rootpe)),mrssOnNode(driver_task_node_map(atm_rootpe)),&
msizeOnNode(driver_task_node_map(lnd_rootpe)),mrssOnNode(driver_task_node_map(lnd_rootpe)),&
msizeOnNode(driver_task_node_map(ice_rootpe)),mrssOnNode(driver_task_node_map(ice_rootpe)),&
msizeOnNode(driver_task_node_map(ocn_rootpe)),mrssOnNode(driver_task_node_map(ocn_rootpe)),&
msizeOnNode(driver_task_node_map(glc_rootpe)),mrssOnNode(driver_task_node_map(glc_rootpe)),&
msizeOnNode(driver_task_node_map(rof_rootpe)),mrssOnNode(driver_task_node_map(rof_rootpe)),&
msizeOnNode(driver_task_node_map(wav_rootpe)),mrssOnNode(driver_task_node_map(wav_rootpe)),&
msizeOnNode(driver_task_node_map(iac_rootpe)),mrssOnNode(driver_task_node_map(iac_rootpe))/)
else
write(logunit,*) "cime_run: valid info_mprof values:0,1,2,3, given:",info_mprof
endif
endif ! iamroot_CPLID
endif ! tod == 0
endif
#endif
if (info_debug > 1) then
if (iamroot_CPLID) then
Expand Down Expand Up @@ -3607,10 +3456,7 @@ subroutine cime_final()
write(logunit,FormatR) subname,' pes max memory last usage (MB) = ',mrss1
write(logunit,'(//)')
close(logunit)
close(mlog)
call shr_file_freeUnit(mlog)
endif
deallocate(msizeOnTask,mrssOnTask,msizeOnNode,mrssOnNode)

call t_adj_detailf(-1)
call t_stopf('CPL:cime_final')
Expand Down
Loading

0 comments on commit f9fcf34

Please sign in to comment.