From 3c3bd602542945a7d12f857bbe832d18dc444612 Mon Sep 17 00:00:00 2001
From: Philippe Blain
Date: Tue, 12 Jul 2022 10:16:56 -0400
Subject: [PATCH] ice_dyn_vp: use 'global_sum_prod' for bit-for-bit output
 reproducibility

The VP solver uses a linear solver, FGMRES, as part of the non-linear
iteration. The FGMRES algorithm involves computing the norm of a
distributed vector field, thus performing global sums. These norms are
computed by first summing the squared X and Y components of a vector
field over each local block in subroutine 'calc_L2norm_squared', then
summing these partial sums across the local blocks, and finally doing
a global (MPI) sum using 'global_sum'.

This approach does not lead to reproducible results when the MPI
distribution, or the number of local blocks, is changed, for reasons
explained in the "Reproducible sums" section of the Developer Guide
(mostly, floating point addition is not associative). This was partly
pointed out in [1] but I failed to realize it at the time.

Make the results of the VP solver more reproducible by using two calls
to 'global_sum_prod' to individually compute the squares of the X and
Y components when computing norms, and then summing these two
reproducible scalars.

The same pattern appears in the FGMRES solver (subroutine 'fgmres'),
the preconditioner 'pgmres' which uses the same algorithm, and the
Classical and Modified Gram-Schmidt algorithms in 'orthogonalize'.

These changes result in twice the number of global sums for fgmres,
pgmres and the MGS algorithm. For the CGS algorithm, the performance
impact is higher as 'global_sum_prod' is called inside the loop,
whereas previously we called 'global_allreduce_sum' after the loop to
compute all 'initer' sums at the same time. To keep that optimization,
we would have to implement a new interface 'global_allreduce_sum_prod'
which would take two arrays of shape (nx_block,ny_block,max_blocks,k)
and sum these over their first three dimensions before performing the
global reduction over the k dimension.

We choose not to go that route for now, mostly because the CGS
algorithm is in any case (by default) only used for the PGMRES
preconditioner, and so the cost should be relatively low as 'initer'
corresponds to 'dim_pgmres' in the namelist, which should be kept low
for efficiency (default 5).

These changes lead to bit-for-bit reproducibility (the decomp_suite
passes) when using 'precond=ident' and 'precond=diag'.
'precond=pgmres' is still not bit-for-bit because some halo updates
are skipped for efficiency. This will be addressed in a following
commit.
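
For illustration only, here is a minimal sketch of the two-call norm
pattern described above. The wrapper name 'reproducible_L2norm' and the
dummy arguments 'vx'/'vy' are hypothetical stand-ins for the X and Y
components of a vector field (e.g. arnoldi_basis_x/y(:,:,:,1)); the
'use'd modules are existing CICE modules:

    ! Reproducible L2 norm of a distributed vector field (vx, vy):
    ! each 'global_sum_prod' call computes a reproducible global sum of
    ! vx*vx (resp. vy*vy); only the two resulting scalars are added here
    ! before taking the square root.
    function reproducible_L2norm(vx, vy) result(norm)
       use ice_kinds_mod        , only: dbl_kind
       use ice_constants        , only: field_loc_NEcorner
       use ice_domain           , only: distrb_info
       use ice_global_reductions, only: global_sum_prod
       real (kind=dbl_kind), dimension(:,:,:), intent(in) :: vx, vy
       real (kind=dbl_kind) :: norm
       norm = sqrt(global_sum_prod(vx, vx, distrb_info, field_loc_NEcorner) + &
                   global_sum_prod(vy, vy, distrb_info, field_loc_NEcorner))
    end function reproducible_L2norm

In the diff below the same two-call pattern is written inline rather
than through such a helper.
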
[1] https://github.com/CICE-Consortium/CICE/pull/491#discussion_r460147629
---
 cicecore/cicedynB/dynamics/ice_dyn_vp.F90 | 109 +++++-----------------
 1 file changed, 25 insertions(+), 84 deletions(-)

diff --git a/cicecore/cicedynB/dynamics/ice_dyn_vp.F90 b/cicecore/cicedynB/dynamics/ice_dyn_vp.F90
index 24421a91f..ba008538e 100644
--- a/cicecore/cicedynB/dynamics/ice_dyn_vp.F90
+++ b/cicecore/cicedynB/dynamics/ice_dyn_vp.F90
@@ -51,7 +51,7 @@ module ice_dyn_vp
            seabed_stress, Ktens, stack_fields, unstack_fields
       use ice_fileunits, only: nu_diag
       use ice_flux, only: fm
-      use ice_global_reductions, only: global_sum, global_allreduce_sum
+      use ice_global_reductions, only: global_sum, global_sum_prod
       use ice_grid, only: dxT, dyT, dxhy, dyhx, cxp, cyp, cxm, cym, uarear
       use ice_exit, only: abort_ice
       use icepack_intfc, only: icepack_warnings_flush, icepack_warnings_aborted
@@ -2847,18 +2847,11 @@ subroutine fgmres (zetax2 , etax2 , &
       ! Start outer (restarts) loop
       do
          ! Compute norm of initial residual
-         !$OMP PARALLEL DO PRIVATE(iblk)
-         do iblk = 1, nblocks
-            call calc_L2norm_squared(nx_block       , ny_block      , &
-                                     icellu   (iblk),                 &
-                                     indxui (:,iblk), indxuj(:,iblk), &
-                                     arnoldi_basis_x(:,:,iblk, 1)   , &
-                                     arnoldi_basis_y(:,:,iblk, 1)   , &
-                                     norm_squared(iblk))
-         enddo
-         !$OMP END PARALLEL DO
-         norm_residual = sqrt(global_sum(sum(norm_squared), distrb_info))
+         norm_residual = sqrt(global_sum_prod(arnoldi_basis_x(:,:,:,1), &
+                                              arnoldi_basis_x(:,:,:,1), distrb_info, field_loc_NEcorner) + &
+                              global_sum_prod(arnoldi_basis_y(:,:,:,1), &
+                                              arnoldi_basis_y(:,:,:,1), distrb_info, field_loc_NEcorner))

          if (my_task == master_task .and. monitor_fgmres) then
             write(nu_diag, '(a,i4,a,d26.16)') "monitor_fgmres: iter_fgmres= ", nbiter, &
@@ -2952,17 +2945,10 @@ subroutine fgmres (zetax2 , etax2 , &
                                  hessenberg)

          ! Compute norm of new Arnoldi vector and update Hessenberg matrix
-         !$OMP PARALLEL DO PRIVATE(iblk)
-         do iblk = 1, nblocks
-            call calc_L2norm_squared(nx_block       , ny_block         , &
-                                     icellu   (iblk),                    &
-                                     indxui (:,iblk), indxuj(:, iblk)  , &
-                                     arnoldi_basis_x(:,:,iblk, nextit) , &
-                                     arnoldi_basis_y(:,:,iblk, nextit) , &
-                                     norm_squared(iblk))
-         enddo
-         !$OMP END PARALLEL DO
-         hessenberg(nextit,initer) = sqrt(global_sum(sum(norm_squared), distrb_info))
+         hessenberg(nextit,initer) = sqrt(global_sum_prod(arnoldi_basis_x(:,:,:,nextit), &
+                                                          arnoldi_basis_x(:,:,:,nextit), distrb_info, field_loc_NEcorner) + &
+                                          global_sum_prod(arnoldi_basis_y(:,:,:,nextit), &
+                                                          arnoldi_basis_y(:,:,:,nextit), distrb_info, field_loc_NEcorner))

          ! Watch out for happy breakdown
          if (.not. almost_zero( hessenberg(nextit,initer) ) ) then
@@ -3240,18 +3226,10 @@ subroutine pgmres (zetax2 , etax2 , &
       ! Start outer (restarts) loop
       do
          ! Compute norm of initial residual
-         !$OMP PARALLEL DO PRIVATE(iblk)
-         do iblk = 1, nblocks
-            call calc_L2norm_squared(nx_block       , ny_block        , &
-                                     icellu   (iblk),                   &
-                                     indxui (:,iblk), indxuj(:, iblk) , &
-                                     arnoldi_basis_x(:,:,iblk, 1)     , &
-                                     arnoldi_basis_y(:,:,iblk, 1)     , &
-                                     norm_squared(iblk))
-
-         enddo
-         !$OMP END PARALLEL DO
-         norm_residual = sqrt(global_sum(sum(norm_squared), distrb_info))
+         norm_residual = sqrt(global_sum_prod(arnoldi_basis_x(:,:,:,1), &
+                                              arnoldi_basis_x(:,:,:,1), distrb_info, field_loc_NEcorner) + &
+                              global_sum_prod(arnoldi_basis_y(:,:,:,1), &
+                                              arnoldi_basis_y(:,:,:,1), distrb_info, field_loc_NEcorner))

          if (my_task == master_task .and. monitor_pgmres) then
             write(nu_diag, '(a,i4,a,d26.16)') "monitor_pgmres: iter_pgmres= ", nbiter, &
@@ -3334,17 +3312,10 @@ subroutine pgmres (zetax2 , etax2 , &
                                  hessenberg)

          ! Compute norm of new Arnoldi vector and update Hessenberg matrix
-         !$OMP PARALLEL DO PRIVATE(iblk)
-         do iblk = 1, nblocks
-            call calc_L2norm_squared(nx_block       , ny_block          , &
-                                     icellu   (iblk),                     &
-                                     indxui (:,iblk), indxuj(:, iblk)   , &
-                                     arnoldi_basis_x(:,:,iblk, nextit)  , &
-                                     arnoldi_basis_y(:,:,iblk, nextit)  , &
-                                     norm_squared(iblk))
-         enddo
-         !$OMP END PARALLEL DO
-         hessenberg(nextit,initer) = sqrt(global_sum(sum(norm_squared), distrb_info))
+         hessenberg(nextit,initer) = sqrt(global_sum_prod(arnoldi_basis_x(:,:,:,nextit), &
+                                                          arnoldi_basis_x(:,:,:,nextit), distrb_info, field_loc_NEcorner) + &
+                                          global_sum_prod(arnoldi_basis_y(:,:,:,nextit), &
+                                                          arnoldi_basis_y(:,:,:,nextit), distrb_info, field_loc_NEcorner))

          ! Watch out for happy breakdown
          if (.not. almost_zero( hessenberg(nextit,initer) ) ) then
@@ -3623,39 +3594,20 @@ subroutine orthogonalize(ortho_type , initer , &
          ij     , & ! compressed index
          i, j       ! grid indices

-      real (kind=dbl_kind), dimension (max_blocks) :: &
-         local_dot     ! local array value to accumulate dot product of grid function over blocks
-
-      real (kind=dbl_kind), dimension(maxinner) :: &
-         dotprod_local ! local array to accumulate several dot product computations
-
       character(len=*), parameter :: subname = '(orthogonalize)'

       if (trim(ortho_type) == 'cgs') then ! Classical Gram-Schmidt
          ! Classical Gram-Schmidt orthogonalisation process
          ! First loop of Gram-Schmidt (compute coefficients)
-         dotprod_local = c0
          do it = 1, initer
-            local_dot = c0
-
-            !$OMP PARALLEL DO PRIVATE(iblk,ij,i,j)
-            do iblk = 1, nblocks
-               do ij = 1, icellu(iblk)
-                  i = indxui(ij, iblk)
-                  j = indxuj(ij, iblk)
+            hessenberg(it, initer) = global_sum_prod(arnoldi_basis_x(:,:,:,it), &
+                                                     arnoldi_basis_x(:,:,:,nextit), distrb_info, field_loc_NEcorner) + &
+                                     global_sum_prod(arnoldi_basis_y(:,:,:,it), &
+                                                     arnoldi_basis_y(:,:,:,nextit), distrb_info, field_loc_NEcorner)
-                  local_dot(iblk) = local_dot(iblk) + &
-                                    (arnoldi_basis_x(i, j, iblk, it) * arnoldi_basis_x(i, j, iblk, nextit)) + &
-                                    (arnoldi_basis_y(i, j, iblk, it) * arnoldi_basis_y(i, j, iblk, nextit))
-               enddo ! ij
-            enddo
-            !$OMP END PARALLEL DO
-
-            dotprod_local(it) = sum(local_dot)
          end do
-         hessenberg(1:initer, initer) = global_allreduce_sum(dotprod_local(1:initer), distrb_info)
-
          ! Second loop of Gram-Schmidt (orthonormalize)
          do it = 1, initer
             !$OMP PARALLEL DO PRIVATE(iblk,ij,i,j)
@@ -3675,22 +3627,11 @@ subroutine orthogonalize(ortho_type , initer , &
       elseif (trim(ortho_type) == 'mgs') then ! Modified Gram-Schmidt
          ! Modified Gram-Schmidt orthogonalisation process
          do it = 1, initer
-            local_dot = c0
-
-            !$OMP PARALLEL DO PRIVATE(iblk,ij,i,j)
-            do iblk = 1, nblocks
-               do ij = 1, icellu(iblk)
-                  i = indxui(ij, iblk)
-                  j = indxuj(ij, iblk)
-
-                  local_dot(iblk) = local_dot(iblk) + &
-                                    (arnoldi_basis_x(i, j, iblk, it) * arnoldi_basis_x(i, j, iblk, nextit)) + &
-                                    (arnoldi_basis_y(i, j, iblk, it) * arnoldi_basis_y(i, j, iblk, nextit))
-               enddo ! ij
-            enddo
-            !$OMP END PARALLEL DO
-            hessenberg(it,initer) = global_sum(sum(local_dot), distrb_info)
+            hessenberg(it, initer) = global_sum_prod(arnoldi_basis_x(:,:,:,it), &
+                                                     arnoldi_basis_x(:,:,:,nextit), distrb_info, field_loc_NEcorner) + &
+                                     global_sum_prod(arnoldi_basis_y(:,:,:,it), &
+                                                     arnoldi_basis_y(:,:,:,nextit), distrb_info, field_loc_NEcorner)

             !$OMP PARALLEL DO PRIVATE(iblk,ij,i,j)
             do iblk = 1, nblocks