Skip to content

Commit

Permalink
Enable optional checkpointing of timing data during initialization
Browse files Browse the repository at this point in the history
Currently a timing checkpoint is saved at the end of the first
simulation day, capturing both initialization cost and possibly
anomalous first simulation day performance. However, when
evaluating performance of initialization for high resolution
cases, this may not be sufficiently fine-grained, especially if the
evaluation job runs out of time before completing the first
simulation day. Here the env_run.xml TPROF_IN_INIT variable is
introduced that enables saving timing checkpoints at
two locations within the initialization. (This adds a new namelist
variable, to seq_infodata_inparm in drv_in.) TPROF_IN_INIT is FALSE
by default, in which case no additional checkpoint data is output.
However, existing timer names associated with the writing out of
performance data are modified slightly, to better attribute time
associated with the new performance data writes.

[BFB] - Bit-For-Bit
[NML] - Namelist Changing
  • Loading branch information
Patrick Worley committed Apr 25, 2019
1 parent 8eb6019 commit 7b1e744
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 22 deletions.
9 changes: 9 additions & 0 deletions cime/src/drivers/mct/cime_config/config_component_cesm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@
</desc>
</entry>

<entry id="TPROF_IN_INIT">
<type>logical</type>
<valid_values>TRUE,FALSE</valid_values>
<default_value>FALSE</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>Turns on checkpointing of timing data during initialization</desc>
</entry>

<entry id="TIMER_DETAIL">
<type>integer</type>
<default_value>2</default_value>
Expand Down
13 changes: 12 additions & 1 deletion cime/src/drivers/mct/cime_config/config_component_e3sm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,18 @@
<default_value>12</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>timer detail FIXME - add documentation</desc>
<desc>Sets maximum number of run loop timing data checkpoints.
This sets values for tprof_option and tprof_n that determine the timing output file frequency.
</desc>
</entry>

<entry id="TPROF_IN_INIT">
<type>logical</type>
<valid_values>TRUE,FALSE</valid_values>
<default_value>FALSE</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>Turns on checkpointing of timing data during initialization</desc>
</entry>

<entry id="DOUT_S_SAVE_INTERIM_RESTART_FILES">
Expand Down
12 changes: 12 additions & 0 deletions cime/src/drivers/mct/cime_config/namelist_definition_drv.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2109,6 +2109,18 @@
</values>
</entry>

<entry id="tprof_in_init" modify_via_xml="TPROF_IN_INIT">
<type>logical</type>
<category>performance</category>
<group>seq_infodata_inparm</group>
<desc>
Turns on checkpointing of timing data during initialization
</desc>
<values>
<value>$TPROF_IN_INIT</value>
</values>
</entry>

<entry id="pause_option" modify_via_xml="PAUSE_OPTION">
<type>char</type>
<category>time</category>
Expand Down
91 changes: 70 additions & 21 deletions cime/src/drivers/mct/main/cime_comp_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ module cime_comp_mod
private :: cime_run_calc_budgets3
private :: cime_run_write_history
private :: cime_run_write_restart
private :: cime_write_performance_checkpoint

#include <mpif.h>

Expand Down Expand Up @@ -461,6 +462,8 @@ module cime_comp_mod
real(r8) :: reprosum_diffmax ! setup reprosum, set rel_diff_max
logical :: reprosum_recompute ! setup reprosum, recompute if tolerance exceeded

logical :: tprof_in_init=.false. ! write performance data checkpoints during initialization

logical :: output_perf = .false. ! require timing data output for this pe
logical :: in_first_day = .true. ! currently simulating first day

Expand Down Expand Up @@ -1042,6 +1045,7 @@ subroutine cime_pre_init2()
reprosum_allow_infnan=reprosum_allow_infnan, &
reprosum_diffmax=reprosum_diffmax , &
reprosum_recompute=reprosum_recompute, &
tprof_in_init=tprof_in_init , &
max_cplstep_time=max_cplstep_time)

! above - cpl_decomp is set to pass the cpl_decomp value to seq_mctext_decomp
Expand Down Expand Up @@ -1395,6 +1399,16 @@ subroutine cime_init()
call t_stopf('CPL:comp_list_all')

call t_stopf('CPL:init_comps')

if (tprof_in_init) then
! --- Write out performance data
call t_set_prefixf("CPL:init_comps_")
call cime_write_performance_checkpoint(output_perf,&
trim(tchkpt_dir)//'/model_timing'//trim(cpl_inst_tag)//'_init_comps',&
mpicom_GLOID)
call t_unset_prefixf()
endif

!----------------------------------------------------------
!| Determine coupling interactions based on present and prognostic flags
!----------------------------------------------------------
Expand Down Expand Up @@ -2172,6 +2186,15 @@ subroutine cime_init()
call t_adj_detailf(-1)
call t_stopf('CPL:cime_init')

if (tprof_in_init) then
! --- Write out performance data
call t_set_prefixf("CPL:cime_init_")
call cime_write_performance_checkpoint(output_perf,&
trim(tchkpt_dir)//'/model_timing'//trim(cpl_inst_tag)//'_cime_init', &
mpicom_GLOID)
call t_unset_prefixf()
endif

end subroutine cime_init

!===============================================================================
Expand Down Expand Up @@ -3090,30 +3113,14 @@ subroutine cime_run()
if ((tod == 0) .and. in_first_day) then
in_first_day = .false.
endif
call t_adj_detailf(+1)

call t_startf("CPL:sync1_tprof")
call mpi_barrier(mpicom_GLOID,ierr)
call t_stopf("CPL:sync1_tprof")

write(timing_file,'(a,i8.8,a1,i5.5)') &
trim(tchkpt_dir)//"/model_timing"//trim(cpl_inst_tag)//"_",ymd,"_",tod
trim(tchkpt_dir)//"/model_timing"//trim(cpl_inst_tag)//"_",ymd,"_",tod

call t_set_prefixf("CPL:")
if (output_perf) then
call t_prf(filename=trim(timing_file), mpicom=mpicom_GLOID, &
num_outpe=0, output_thispe=output_perf)
else
call t_prf(filename=trim(timing_file), mpicom=mpicom_GLOID, &
num_outpe=0)
endif
call t_set_prefixf("CPL:RUN_LOOP_")
call cime_write_performance_checkpoint(output_perf,timing_file,mpicom_GLOID)
call t_unset_prefixf()

call t_startf("CPL:sync2_tprof")
call mpi_barrier(mpicom_GLOID,ierr)
call t_stopf("CPL:sync2_tprof")

call t_adj_detailf(-1)
endif
call t_stopf ('CPL:TPROF_WRITE')

Expand Down Expand Up @@ -3209,9 +3216,11 @@ subroutine cime_final()
call t_adj_detailf(-1)
call t_stopf ('CPL:FINAL')

call t_startf("sync3_tprof")
call t_set_prefixf("CPL:FINAL_")

call t_startf("sync1_tprf")
call mpi_barrier(mpicom_GLOID,ierr)
call t_stopf("sync3_tprof")
call t_stopf("sync1_tprf")

if (output_perf) then
call t_prf(trim(timing_dir)//'/model_timing'//trim(cpl_inst_tag), &
Expand All @@ -3221,6 +3230,8 @@ subroutine cime_final()
mpicom=mpicom_GLOID)
endif

call t_unset_prefixf()

call t_finalizef()

end subroutine cime_final
Expand Down Expand Up @@ -4245,4 +4256,42 @@ subroutine cime_run_write_restart(drv_pause, write_restart, drv_resume)

end subroutine cime_run_write_restart

!----------------------------------------------------------------------------------

subroutine cime_write_performance_checkpoint(output_ckpt, ckpt_filename, &
     ckpt_mpicom)

   !----------------------------------------------------------
   ! Checkpoint performance data.
   !
   ! Calls t_prf between two barriers on ckpt_mpicom. Each barrier is
   ! wrapped in its own timer ("sync1_tprf" before the write,
   ! "sync2_tprf" after it). The timer detail level is adjusted by +1
   ! on entry and by -1 on exit.
   !
   ! Arguments:
   !   output_ckpt   - when .true., t_prf is called with
   !                   output_thispe=output_ckpt; when .false., the
   !                   output_thispe argument is omitted from the call
   !   ckpt_filename - file name passed to t_prf (after trim)
   !   ckpt_mpicom   - MPI communicator used for both barriers and
   !                   passed to t_prf as mpicom
   !----------------------------------------------------------

   logical,          intent(in) :: output_ckpt
   character(len=*), intent(in) :: ckpt_filename
   integer,          intent(in) :: ckpt_mpicom

   ! Local MPI status so this routine does not depend on (or overwrite)
   ! an ierr variable from the enclosing scope. The value is not inspected.
   integer :: ierr

   call t_adj_detailf(+1)

   ! Barrier before the write, timed separately from the write itself.
   call t_startf("sync1_tprf")
   call mpi_barrier(ckpt_mpicom, ierr)
   call t_stopf("sync1_tprf")

   ! NOTE(review): both call forms are kept because passing
   ! output_thispe=.false. may not be equivalent to omitting the
   ! argument -- confirm against the t_prf implementation before merging.
   if (output_ckpt) then
      call t_prf(filename=trim(ckpt_filename), mpicom=ckpt_mpicom, &
           num_outpe=0, output_thispe=output_ckpt)
   else
      call t_prf(filename=trim(ckpt_filename), mpicom=ckpt_mpicom, &
           num_outpe=0)
   endif

   ! Barrier after the write, again under its own timer.
   call t_startf("sync2_tprf")
   call mpi_barrier(ckpt_mpicom, ierr)
   call t_stopf("sync2_tprf")

   call t_adj_detailf(-1)

end subroutine cime_write_performance_checkpoint

end module cime_comp_mod
15 changes: 15 additions & 0 deletions cime/src/drivers/mct/shr/seq_infodata_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ MODULE seq_infodata_mod
logical :: reprosum_recompute ! recompute reprosum with nonscalable algorithm
! if reprosum_diffmax is exceeded

logical :: tprof_in_init ! performance data checkpoints written during initialization

!--- set via namelist and may be time varying ---
integer(SHR_KIND_IN) :: info_debug ! debug level
logical :: bfbflag ! turn on bfb option
Expand Down Expand Up @@ -401,6 +403,7 @@ SUBROUTINE seq_infodata_Init( infodata, nmlfile, ID, pioid, cpl_tag)
real(SHR_KIND_R8) :: reprosum_diffmax ! maximum difference tolerance
logical :: reprosum_recompute ! recompute reprosum with nonscalable algorithm
! if reprosum_diffmax is exceeded
logical :: tprof_in_init ! performance data checkpoints written during initialization
logical :: mct_usealltoall ! flag for mct alltoall
logical :: mct_usevector ! flag for mct vector
real(shr_kind_r8) :: max_cplstep_time ! abort if cplstep time exceeds this value
Expand Down Expand Up @@ -440,6 +443,7 @@ SUBROUTINE seq_infodata_Init( infodata, nmlfile, ID, pioid, cpl_tag)
eps_oarea, esmf_map_flag, &
reprosum_use_ddpdd, reprosum_allow_infnan, &
reprosum_diffmax, reprosum_recompute, &
tprof_in_init, &
mct_usealltoall, mct_usevector, max_cplstep_time, model_doi_url

!-------------------------------------------------------------------------------
Expand Down Expand Up @@ -549,6 +553,7 @@ SUBROUTINE seq_infodata_Init( infodata, nmlfile, ID, pioid, cpl_tag)
reprosum_allow_infnan = .false.
reprosum_diffmax = -1.0e-8
reprosum_recompute = .false.
tprof_in_init = .false.
mct_usealltoall = .false.
mct_usevector = .false.
max_cplstep_time = 0.0
Expand Down Expand Up @@ -678,6 +683,7 @@ SUBROUTINE seq_infodata_Init( infodata, nmlfile, ID, pioid, cpl_tag)
infodata%reprosum_allow_infnan = reprosum_allow_infnan
infodata%reprosum_diffmax = reprosum_diffmax
infodata%reprosum_recompute = reprosum_recompute
infodata%tprof_in_init = tprof_in_init
infodata%mct_usealltoall = mct_usealltoall
infodata%mct_usevector = mct_usevector

Expand Down Expand Up @@ -960,6 +966,7 @@ SUBROUTINE seq_infodata_GetData_explicit( infodata, cime_model, case_name, case_
eps_agrid, eps_aarea, eps_omask, eps_ogrid, eps_oarea, &
reprosum_use_ddpdd, reprosum_allow_infnan, &
reprosum_diffmax, reprosum_recompute, &
tprof_in_init, &
mct_usealltoall, mct_usevector, max_cplstep_time, model_doi_url, &
glc_valid_input)

Expand Down Expand Up @@ -1067,6 +1074,7 @@ SUBROUTINE seq_infodata_GetData_explicit( infodata, cime_model, case_name, case_
logical, optional, intent(OUT) :: reprosum_allow_infnan ! allow INF and NaN summands
real(SHR_KIND_R8), optional, intent(OUT) :: reprosum_diffmax ! maximum difference tolerance
logical, optional, intent(OUT) :: reprosum_recompute ! recompute if tolerance exceeded
logical, optional, intent(OUT) :: tprof_in_init ! performance checkpoints in init
logical, optional, intent(OUT) :: mct_usealltoall ! flag for mct alltoall
logical, optional, intent(OUT) :: mct_usevector ! flag for mct vector

Expand Down Expand Up @@ -1235,6 +1243,7 @@ SUBROUTINE seq_infodata_GetData_explicit( infodata, cime_model, case_name, case_
if ( present(reprosum_allow_infnan)) reprosum_allow_infnan = infodata%reprosum_allow_infnan
if ( present(reprosum_diffmax) ) reprosum_diffmax = infodata%reprosum_diffmax
if ( present(reprosum_recompute)) reprosum_recompute = infodata%reprosum_recompute
if ( present(tprof_in_init) ) tprof_in_init = infodata%tprof_in_init
if ( present(mct_usealltoall)) mct_usealltoall = infodata%mct_usealltoall
if ( present(mct_usevector) ) mct_usevector = infodata%mct_usevector

Expand Down Expand Up @@ -1466,6 +1475,7 @@ SUBROUTINE seq_infodata_PutData_explicit( infodata, cime_model, case_name, case_
eps_agrid, eps_aarea, eps_omask, eps_ogrid, eps_oarea, &
reprosum_use_ddpdd, reprosum_allow_infnan, &
reprosum_diffmax, reprosum_recompute, &
tprof_in_init, &
mct_usealltoall, mct_usevector, glc_valid_input)


Expand Down Expand Up @@ -1571,6 +1581,7 @@ SUBROUTINE seq_infodata_PutData_explicit( infodata, cime_model, case_name, case_
logical, optional, intent(IN) :: reprosum_allow_infnan ! allow INF and NaN summands
real(SHR_KIND_R8), optional, intent(IN) :: reprosum_diffmax ! maximum difference tolerance
logical, optional, intent(IN) :: reprosum_recompute ! recompute if tolerance exceeded
logical, optional, intent(IN) :: tprof_in_init ! performance checkpoints in init
logical, optional, intent(IN) :: mct_usealltoall ! flag for mct alltoall
logical, optional, intent(IN) :: mct_usevector ! flag for mct vector

Expand Down Expand Up @@ -1737,6 +1748,7 @@ SUBROUTINE seq_infodata_PutData_explicit( infodata, cime_model, case_name, case_
if ( present(reprosum_allow_infnan)) infodata%reprosum_allow_infnan = reprosum_allow_infnan
if ( present(reprosum_diffmax) ) infodata%reprosum_diffmax = reprosum_diffmax
if ( present(reprosum_recompute)) infodata%reprosum_recompute = reprosum_recompute
if ( present(tprof_in_init) ) infodata%tprof_in_init = tprof_in_init
if ( present(mct_usealltoall)) infodata%mct_usealltoall = mct_usealltoall
if ( present(mct_usevector) ) infodata%mct_usevector = mct_usevector

Expand Down Expand Up @@ -2023,6 +2035,7 @@ subroutine seq_infodata_bcast(infodata,mpicom)
call shr_mpi_bcast(infodata%reprosum_allow_infnan, mpicom)
call shr_mpi_bcast(infodata%reprosum_diffmax, mpicom)
call shr_mpi_bcast(infodata%reprosum_recompute, mpicom)
call shr_mpi_bcast(infodata%tprof_in_init, mpicom)
call shr_mpi_bcast(infodata%mct_usealltoall, mpicom)
call shr_mpi_bcast(infodata%mct_usevector, mpicom)

Expand Down Expand Up @@ -2684,6 +2697,8 @@ SUBROUTINE seq_infodata_print( infodata )
write(logunit,F0R) subname,'reprosum_diffmax = ', infodata%reprosum_diffmax
write(logunit,F0L) subname,'reprosum_recompute = ', infodata%reprosum_recompute

write(logunit,F0L) subname,'tprof_in_init = ', infodata%tprof_in_init

write(logunit,F0L) subname,'mct_usealltoall = ', infodata%mct_usealltoall
write(logunit,F0L) subname,'mct_usevector = ', infodata%mct_usevector

Expand Down

0 comments on commit 7b1e744

Please sign in to comment.