From bfcfece4388c3e964b2e86679eb14cf577dc9fbd Mon Sep 17 00:00:00 2001 From: Hyun-Gyu Kang Date: Tue, 19 Apr 2022 11:18:14 -0400 Subject: [PATCH] Change a reproducible sum algorithm in SI solver Change a reproducible sum algorithm in the ocean split-implicit solver. The changed algorithm uses a global reproducible summation module in components/mpas-framework/src/framework/mpas_global_sum_mod.F --- .../mpas_ocn_time_integration_si.F | 487 ++++++++++-------- 1 file changed, 273 insertions(+), 214 deletions(-) diff --git a/components/mpas-ocean/src/mode_forward/mpas_ocn_time_integration_si.F b/components/mpas-ocean/src/mode_forward/mpas_ocn_time_integration_si.F index b8034ab24517..9f5a4c170f19 100644 --- a/components/mpas-ocean/src/mode_forward/mpas_ocn_time_integration_si.F +++ b/components/mpas-ocean/src/mode_forward/mpas_ocn_time_integration_si.F @@ -35,6 +35,7 @@ module ocn_time_integration_si use mpas_threading use mpas_timekeeping use mpas_log + use mpas_global_sum_mod use ocn_config use ocn_mesh @@ -297,6 +298,11 @@ subroutine ocn_time_integrator_si(domain, dt)!{{{ SIcst_allreduce_temp9 ! temp for global summations integer , dimension(9) :: & SIcst_allreduce_itemp9 ! temp for partition match mode + real (kind=RKIND), dimension(:,:), allocatable :: & + globalReprodSum2fld1, & ! array for global reproducible sum + globalReprodSum2fld2, & ! array for global reproducible sum + globalReprodSum9fld1, & ! array for global reproducible sum + globalReprodSum9fld2 ! array for global reproducible sum real (kind=RKIND) :: & ! temp scalars for the SI method SIcst_q0y0 , SIcst_y0y0 , SIcst_q0q0 , & @@ -831,6 +837,13 @@ subroutine ocn_time_integrator_si(domain, dt)!{{{ !$omp end parallel endif + if ( config_btr_si_partition_match_mode ) then + allocate( globalReprodSum2fld1(nCellsOwned,2), & + globalReprodSum2fld2(nCellsOwned,2), & + globalReprodSum9fld1(nCellsOwned,9), & + globalReprodSum9fld2(nCellsOwned,9) ) + endif + !-------------------------------------------------------------! ! BEGIN Large barotropic system iteration loop !-------------------------------------------------------------! @@ -1159,42 +1172,46 @@ subroutine ocn_time_integrator_si(domain, dt)!{{{ !$omp end parallel ! Reduction --------------------------------------------------! - SIcst_r00r0 = 0.0_RKIND - SIcst_r00w0 = 0.0_RKIND - - do iCell = 1, nCellsOwned - SIcst_r00r0 = SIcst_r00r0 + SIvec_r00(iCell) & - * SIvec_r0(iCell) - SIcst_r00w0 = SIcst_r00w0 + SIvec_r00(iCell) & - * SIvec_w0(iCell) - end do ! iCell + + if ( config_btr_si_partition_match_mode ) then + + ! Reproducible sum of multiple fields over products + + do iCell = 1,nCellsOwned + globalReprodSum2fld1(iCell,1) = SIvec_r00(iCell) + globalReprodSum2fld1(iCell,2) = SIvec_r00(iCell) + + globalReprodSum2fld2(iCell,1) = SIvec_r0(iCell) + globalReprodSum2fld2(iCell,2) = SIvec_w0(iCell) + end do + + SIcst_allreduce_global2(:) = & + mpas_global_sum_nfld(globalReprodSum2fld1, & + globalReprodSum2fld2, & + domain%dminfo%comm) + + else + + SIcst_r00r0 = 0.0_RKIND + SIcst_r00w0 = 0.0_RKIND + + do iCell = 1, nCellsOwned + SIcst_r00r0 = SIcst_r00r0 + SIvec_r00(iCell) & + * SIvec_r0(iCell) + SIcst_r00w0 = SIcst_r00w0 + SIvec_r00(iCell) & + * SIvec_w0(iCell) + end do ! iCell + + SIcst_allreduce_local2(1) = SIcst_r00r0 + SIcst_allreduce_local2(2) = SIcst_r00w0 + + ! Global sum across CPUs + call mpas_timer_start("si reduction r0") + call mpas_dmpar_sum_real_array(domain % dminfo, 2, & + SIcst_allreduce_local2, & + SIcst_allreduce_global2) + call mpas_timer_stop ("si reduction r0") - SIcst_allreduce_local2(1) = SIcst_r00r0 - SIcst_allreduce_local2(2) = SIcst_r00w0 - - ! Global sum across CPUs - call mpas_timer_start("si reduction r0") - call mpas_dmpar_sum_real_array(domain % dminfo, 2, & - SIcst_allreduce_local2, & - SIcst_allreduce_global2) - call mpas_timer_stop ("si reduction r0") - - if ( config_btr_si_partition_match_mode .and. ncpus > 1) then - SIcst_allreduce_temp9(:) = 0.0_RKIND - SIcst_allreduce_itemp9(:) = 0.0_RKIND - - SIcst_allreduce_itemp9(1:2) = & - exponent(SIcst_allreduce_global2(:)) - SIcst_allreduce_temp9(1:2) = & - fraction(SIcst_allreduce_global2(:)) - SIcst_allreduce_temp9(1:2) = & - anint(SIcst_allreduce_temp9(1:2) & - * 1.0e+4_RKIND ) & - / 1.0e+4_RKIND - - SIcst_allreduce_global2(:) = & - SIcst_allreduce_temp9(1:2) & - * 2.0_RKIND ** (SIcst_allreduce_itemp9(1:2)) endif SIcst_r00r0_global = SIcst_allreduce_global2(1) @@ -1338,78 +1355,95 @@ subroutine ocn_time_integrator_si(domain, dt)!{{{ !$omp end parallel ! Reduction -----------------------------------------------! - SIcst_r00s0 = 0.0_RKIND - SIcst_r00z0 = 0.0_RKIND - SIcst_q0y0 = 0.0_RKIND - SIcst_y0y0 = 0.0_RKIND - SIcst_r00q0 = 0.0_RKIND - SIcst_r00y0 = 0.0_RKIND - SIcst_r00t0 = 0.0_RKIND - SIcst_r00v0 = 0.0_RKIND - SIcst_q0q0 = 0.0_RKIND - - do iCell = 1,nCellsOwned - SIcst_r00s0 = SIcst_r00s0 + SIvec_r00(iCell) & - * SIvec_s1(iCell) ! s1 - - SIcst_r00z0 = SIcst_r00z0 + SIvec_r00(iCell) & - * SIvec_z1(iCell) ! z1 - - SIcst_q0y0 = SIcst_q0y0 + SIvec_q0(iCell) & - * SIvec_y0(iCell) - - SIcst_y0y0 = SIcst_y0y0 + SIvec_y0(iCell) & - * SIvec_y0(iCell) - - SIcst_r00q0 = SIcst_r00q0 + SIvec_r00(iCell) & - * SIvec_q0(iCell) - - SIcst_r00y0 = SIcst_r00y0 + SIvec_r00(iCell) & - * SIvec_y0(iCell) - - SIcst_r00t0 = SIcst_r00t0 + SIvec_r00(iCell) & - * SIvec_t0(iCell) + if ( config_btr_si_partition_match_mode ) then + + ! Reproducible sum of multiple fields over products + + do iCell = 1,nCellsOwned + globalReprodSum9fld1(iCell,1) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,2) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,3) = SIvec_q0(iCell) + globalReprodSum9fld1(iCell,4) = SIvec_y0(iCell) + globalReprodSum9fld1(iCell,5) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,6) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,7) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,8) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,9) = SIvec_q0(iCell) + + globalReprodSum9fld2(iCell,1) = SIvec_s1(iCell) + globalReprodSum9fld2(iCell,2) = SIvec_z1(iCell) + globalReprodSum9fld2(iCell,3) = SIvec_y0(iCell) + globalReprodSum9fld2(iCell,4) = SIvec_y0(iCell) + globalReprodSum9fld2(iCell,5) = SIvec_q0(iCell) + globalReprodSum9fld2(iCell,6) = SIvec_y0(iCell) + globalReprodSum9fld2(iCell,7) = SIvec_t0(iCell) + globalReprodSum9fld2(iCell,8) = SIvec_v0(iCell) + globalReprodSum9fld2(iCell,9) = SIvec_q0(iCell) + end do + + SIcst_allreduce_global9(:) = & + mpas_global_sum_nfld(globalReprodSum9fld1, & + globalReprodSum9fld2, & + domain%dminfo%comm) + + else + + SIcst_r00s0 = 0.0_RKIND + SIcst_r00z0 = 0.0_RKIND + SIcst_q0y0 = 0.0_RKIND + SIcst_y0y0 = 0.0_RKIND + SIcst_r00q0 = 0.0_RKIND + SIcst_r00y0 = 0.0_RKIND + SIcst_r00t0 = 0.0_RKIND + SIcst_r00v0 = 0.0_RKIND + SIcst_q0q0 = 0.0_RKIND + + do iCell = 1,nCellsOwned + SIcst_r00s0 = SIcst_r00s0 + SIvec_r00(iCell) & + * SIvec_s1(iCell) ! s1 + + SIcst_r00z0 = SIcst_r00z0 + SIvec_r00(iCell) & + * SIvec_z1(iCell) ! z1 + + SIcst_q0y0 = SIcst_q0y0 + SIvec_q0(iCell) & + * SIvec_y0(iCell) + + SIcst_y0y0 = SIcst_y0y0 + SIvec_y0(iCell) & + * SIvec_y0(iCell) + + SIcst_r00q0 = SIcst_r00q0 + SIvec_r00(iCell) & + * SIvec_q0(iCell) + + SIcst_r00y0 = SIcst_r00y0 + SIvec_r00(iCell) & + * SIvec_y0(iCell) + + SIcst_r00t0 = SIcst_r00t0 + SIvec_r00(iCell) & + * SIvec_t0(iCell) + + SIcst_r00v0 = SIcst_r00v0 + SIvec_r00(iCell) & + * SIvec_v0(iCell) + + SIcst_q0q0 = SIcst_q0q0 + SIvec_q0(iCell) & + * SIvec_q0(iCell) + end do + + SIcst_allreduce_local9(1) = SIcst_r00s0 + SIcst_allreduce_local9(2) = SIcst_r00z0 + SIcst_allreduce_local9(3) = SIcst_q0y0 + SIcst_allreduce_local9(4) = SIcst_y0y0 + SIcst_allreduce_local9(5) = SIcst_r00q0 + SIcst_allreduce_local9(6) = SIcst_r00y0 + SIcst_allreduce_local9(7) = SIcst_r00t0 + SIcst_allreduce_local9(8) = SIcst_r00v0 + SIcst_allreduce_local9(9) = SIcst_q0q0 + + ! Global sum across CPUs + call mpas_timer_start("si reduction iter") + call mpas_dmpar_sum_real_array(domain % dminfo, 9, & + SIcst_allreduce_local9, & + SIcst_allreduce_global9) + call mpas_timer_stop("si reduction iter") - SIcst_r00v0 = SIcst_r00v0 + SIvec_r00(iCell) & - * SIvec_v0(iCell) - - SIcst_q0q0 = SIcst_q0q0 + SIvec_q0(iCell) & - * SIvec_q0(iCell) - end do - - SIcst_allreduce_local9(1) = SIcst_r00s0 - SIcst_allreduce_local9(2) = SIcst_r00z0 - SIcst_allreduce_local9(3) = SIcst_q0y0 - SIcst_allreduce_local9(4) = SIcst_y0y0 - SIcst_allreduce_local9(5) = SIcst_r00q0 - SIcst_allreduce_local9(6) = SIcst_r00y0 - SIcst_allreduce_local9(7) = SIcst_r00t0 - SIcst_allreduce_local9(8) = SIcst_r00v0 - SIcst_allreduce_local9(9) = SIcst_q0q0 - - ! Global sum across CPUs - call mpas_timer_start("si reduction iter") - call mpas_dmpar_sum_real_array(domain % dminfo, 9, & - SIcst_allreduce_local9, & - SIcst_allreduce_global9) - call mpas_timer_stop("si reduction iter") - - if ( config_btr_si_partition_match_mode .and. ncpus>1) then - SIcst_allreduce_temp9(:) = 0.0_RKIND - SIcst_allreduce_itemp9(:) = 0.0_RKIND - - SIcst_allreduce_itemp9(:) = & - exponent(SIcst_allreduce_global9(:)) - SIcst_allreduce_temp9(:) = & - fraction(SIcst_allreduce_global9(:)) - SIcst_allreduce_temp9(:) = & - anint( SIcst_allreduce_temp9(:) & - * 1.0e+4_RKIND ) & - / 1.0e+4_RKIND - - SIcst_allreduce_global9(:) = & - SIcst_allreduce_temp9(:) & - * 2.0_RKIND ** (SIcst_allreduce_itemp9(:)) endif SIcst_r00s0_global = SIcst_allreduce_global9(1) @@ -1936,43 +1970,45 @@ subroutine ocn_time_integrator_si(domain, dt)!{{{ ! Reduction -----------------------------------------------! - SIcst_r00r0 = 0.0_RKIND - SIcst_r00w0 = 0.0_RKIND - - do iCell = 1, nCellsOwned - SIcst_r00r0 = SIcst_r00r0 + SIvec_r00(iCell) & - * SIvec_r0(iCell) - SIcst_r00w0 = SIcst_r00w0 + SIvec_r00(iCell) & - * SIvec_w0(iCell) - end do ! iCell + if ( config_btr_si_partition_match_mode ) then + + ! Reproducible sum of multiple fields over products + + do iCell = 1,nCellsOwned + globalReprodSum2fld1(iCell,1) = SIvec_r00(iCell) + globalReprodSum2fld1(iCell,2) = SIvec_r00(iCell) + + globalReprodSum2fld2(iCell,1) = SIvec_r0(iCell) + globalReprodSum2fld2(iCell,2) = SIvec_w0(iCell) + end do + + SIcst_allreduce_global2(:) = & + mpas_global_sum_nfld(globalReprodSum2fld1, & + globalReprodSum2fld2, & + domain%dminfo%comm) + + else + + SIcst_r00r0 = 0.0_RKIND + SIcst_r00w0 = 0.0_RKIND + + do iCell = 1, nCellsOwned + SIcst_r00r0 = SIcst_r00r0 + SIvec_r00(iCell) & + * SIvec_r0(iCell) + SIcst_r00w0 = SIcst_r00w0 + SIvec_r00(iCell) & + * SIvec_w0(iCell) + end do ! iCell + + SIcst_allreduce_local2(1) = SIcst_r00r0 + SIcst_allreduce_local2(2) = SIcst_r00w0 + + ! Global sum across CPUs + call mpas_timer_start("si reduction r0") + call mpas_dmpar_sum_real_array(domain % dminfo, 2, & + SIcst_allreduce_local2, & + SIcst_allreduce_global2) + call mpas_timer_stop ("si reduction r0") - SIcst_allreduce_local2(1) = SIcst_r00r0 - SIcst_allreduce_local2(2) = SIcst_r00w0 - - ! Global sum across CPUs - call mpas_timer_start("si reduction r0") - call mpas_dmpar_sum_real_array(domain % dminfo, 2, & - SIcst_allreduce_local2, & - SIcst_allreduce_global2) - call mpas_timer_stop ("si reduction r0") - - - if ( config_btr_si_partition_match_mode .and. ncpus > 1) then - SIcst_allreduce_temp9(:) = 0.0_RKIND - SIcst_allreduce_itemp9(:) = 0.0_RKIND - - SIcst_allreduce_itemp9(1:2) = & - exponent(SIcst_allreduce_global2(:)) - SIcst_allreduce_temp9(1:2) = & - fraction(SIcst_allreduce_global2(:)) - SIcst_allreduce_temp9(1:2) = & - anint(SIcst_allreduce_temp9(1:2) & - * 1.0e+4_RKIND ) & - / 1.0e+4_RKIND - - SIcst_allreduce_global2(:) = & - SIcst_allreduce_temp9(1:2) & - * 2.0_RKIND ** (SIcst_allreduce_itemp9(1:2)) endif @@ -2115,78 +2151,96 @@ subroutine ocn_time_integrator_si(domain, dt)!{{{ !$omp end parallel ! Reduction -----------------------------------------------! - SIcst_r00s0 = 0.0_RKIND - SIcst_r00z0 = 0.0_RKIND - SIcst_q0y0 = 0.0_RKIND - SIcst_y0y0 = 0.0_RKIND - SIcst_r00q0 = 0.0_RKIND - SIcst_r00y0 = 0.0_RKIND - SIcst_r00t0 = 0.0_RKIND - SIcst_r00v0 = 0.0_RKIND - SIcst_q0q0 = 0.0_RKIND - - do iCell = 1,nCellsOwned - SIcst_r00s0 = SIcst_r00s0 + SIvec_r00(iCell) & - * SIvec_s1(iCell) ! s1 - - SIcst_r00z0 = SIcst_r00z0 + SIvec_r00(iCell) & - * SIvec_z1(iCell) ! z1 - - SIcst_q0y0 = SIcst_q0y0 + SIvec_q0(iCell) & - * SIvec_y0(iCell) - - SIcst_y0y0 = SIcst_y0y0 + SIvec_y0(iCell) & - * SIvec_y0(iCell) - - SIcst_r00q0 = SIcst_r00q0 + SIvec_r00(iCell) & - * SIvec_q0(iCell) - - SIcst_r00y0 = SIcst_r00y0 + SIvec_r00(iCell) & - * SIvec_y0(iCell) - - SIcst_r00t0 = SIcst_r00t0 + SIvec_r00(iCell) & - * SIvec_t0(iCell) - - SIcst_r00v0 = SIcst_r00v0 + SIvec_r00(iCell) & - * SIvec_v0(iCell) - - SIcst_q0q0 = SIcst_q0q0 + SIvec_q0(iCell) & - * SIvec_q0(iCell) - end do - - SIcst_allreduce_local9(1) = SIcst_r00s0 - SIcst_allreduce_local9(2) = SIcst_r00z0 - SIcst_allreduce_local9(3) = SIcst_q0y0 - SIcst_allreduce_local9(4) = SIcst_y0y0 - SIcst_allreduce_local9(5) = SIcst_r00q0 - SIcst_allreduce_local9(6) = SIcst_r00y0 - SIcst_allreduce_local9(7) = SIcst_r00t0 - SIcst_allreduce_local9(8) = SIcst_r00v0 - SIcst_allreduce_local9(9) = SIcst_q0q0 - - ! Global sum across CPUs - call mpas_timer_start("si reduction iter") - call mpas_dmpar_sum_real_array(domain % dminfo, 9, & - SIcst_allreduce_local9, & - SIcst_allreduce_global9) - call mpas_timer_stop("si reduction iter") - - if ( config_btr_si_partition_match_mode .and. ncpus>1) then - SIcst_allreduce_temp9(:) = 0.0_RKIND - SIcst_allreduce_itemp9(:) = 0.0_RKIND - - SIcst_allreduce_itemp9(:) = & - exponent(SIcst_allreduce_global9(:)) - SIcst_allreduce_temp9(:) = & - fraction(SIcst_allreduce_global9(:)) - SIcst_allreduce_temp9(:) = & - anint( SIcst_allreduce_temp9(:) & - * 1.0e+4_RKIND ) & - / 1.0e+4_RKIND - - SIcst_allreduce_global9(:) = & - SIcst_allreduce_temp9(:) & - * 2.0_RKIND ** (SIcst_allreduce_itemp9(:)) + + if ( config_btr_si_partition_match_mode ) then + + ! Reproducible sum of multiple fields over products + + do iCell = 1,nCellsOwned + globalReprodSum9fld1(iCell,1) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,2) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,3) = SIvec_q0(iCell) + globalReprodSum9fld1(iCell,4) = SIvec_y0(iCell) + globalReprodSum9fld1(iCell,5) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,6) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,7) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,8) = SIvec_r00(iCell) + globalReprodSum9fld1(iCell,9) = SIvec_q0(iCell) + + globalReprodSum9fld2(iCell,1) = SIvec_s1(iCell) + globalReprodSum9fld2(iCell,2) = SIvec_z1(iCell) + globalReprodSum9fld2(iCell,3) = SIvec_y0(iCell) + globalReprodSum9fld2(iCell,4) = SIvec_y0(iCell) + globalReprodSum9fld2(iCell,5) = SIvec_q0(iCell) + globalReprodSum9fld2(iCell,6) = SIvec_y0(iCell) + globalReprodSum9fld2(iCell,7) = SIvec_t0(iCell) + globalReprodSum9fld2(iCell,8) = SIvec_v0(iCell) + globalReprodSum9fld2(iCell,9) = SIvec_q0(iCell) + end do + + SIcst_allreduce_global9(:) = & + mpas_global_sum_nfld(globalReprodSum9fld1, & + globalReprodSum9fld2, & + domain%dminfo%comm) + + else + + SIcst_r00s0 = 0.0_RKIND + SIcst_r00z0 = 0.0_RKIND + SIcst_q0y0 = 0.0_RKIND + SIcst_y0y0 = 0.0_RKIND + SIcst_r00q0 = 0.0_RKIND + SIcst_r00y0 = 0.0_RKIND + SIcst_r00t0 = 0.0_RKIND + SIcst_r00v0 = 0.0_RKIND + SIcst_q0q0 = 0.0_RKIND + + do iCell = 1,nCellsOwned + SIcst_r00s0 = SIcst_r00s0 + SIvec_r00(iCell) & + * SIvec_s1(iCell) ! s1 + + SIcst_r00z0 = SIcst_r00z0 + SIvec_r00(iCell) & + * SIvec_z1(iCell) ! z1 + + SIcst_q0y0 = SIcst_q0y0 + SIvec_q0(iCell) & + * SIvec_y0(iCell) + + SIcst_y0y0 = SIcst_y0y0 + SIvec_y0(iCell) & + * SIvec_y0(iCell) + + SIcst_r00q0 = SIcst_r00q0 + SIvec_r00(iCell) & + * SIvec_q0(iCell) + + SIcst_r00y0 = SIcst_r00y0 + SIvec_r00(iCell) & + * SIvec_y0(iCell) + + SIcst_r00t0 = SIcst_r00t0 + SIvec_r00(iCell) & + * SIvec_t0(iCell) + + SIcst_r00v0 = SIcst_r00v0 + SIvec_r00(iCell) & + * SIvec_v0(iCell) + + SIcst_q0q0 = SIcst_q0q0 + SIvec_q0(iCell) & + * SIvec_q0(iCell) + end do + + SIcst_allreduce_local9(1) = SIcst_r00s0 + SIcst_allreduce_local9(2) = SIcst_r00z0 + SIcst_allreduce_local9(3) = SIcst_q0y0 + SIcst_allreduce_local9(4) = SIcst_y0y0 + SIcst_allreduce_local9(5) = SIcst_r00q0 + SIcst_allreduce_local9(6) = SIcst_r00y0 + SIcst_allreduce_local9(7) = SIcst_r00t0 + SIcst_allreduce_local9(8) = SIcst_r00v0 + SIcst_allreduce_local9(9) = SIcst_q0q0 + + ! Global sum across CPUs + call mpas_timer_start("si reduction iter") + call mpas_dmpar_sum_real_array(domain % dminfo, 9, & + SIcst_allreduce_local9, & + SIcst_allreduce_global9) + call mpas_timer_stop("si reduction iter") + endif SIcst_r00s0_global = SIcst_allreduce_global9(1) @@ -2497,6 +2551,11 @@ subroutine ocn_time_integrator_si(domain, dt)!{{{ ! END Large barotropic system iteration loop !-------------------------------------------------------------! + if ( config_btr_si_partition_match_mode ) then + deallocate( globalReprodSum2fld1,globalReprodSum2fld2, & + globalReprodSum9fld1,globalReprodSum9fld2 ) + endif + ! Check that you can compute SSH using the total sum or the ! individual increments over the barotropic subcycles. ! efficiency: This next block of code is really a check for