diff --git a/.github/workflows/build_ubuntu_gnu.yml b/.github/workflows/build_ubuntu_gnu.yml index f283426098..3fea79464d 100644 --- a/.github/workflows/build_ubuntu_gnu.yml +++ b/.github/workflows/build_ubuntu_gnu.yml @@ -7,13 +7,11 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - distcheck-conf-flags: [--enable-openmp, --disable-openmp, --enable-mixed-mode, --disable-setting-flags] + distcheck-conf-flags: [--enable-openmp, --disable-openmp, --disable-setting-flags] fcflags: ["-I/usr/include", "-fdefault-real-8 -fdefault-double-8 -fcray-pointer -ffree-line-length-none -I/usr/include"] exclude: - distcheck-conf-flags: --disable-setting-flags fcflags: -I/usr/include - - distcheck-conf-flags: --enable-mixed-mode - fcflags: "-fdefault-real-8 -fdefault-double-8 -fcray-pointer -ffree-line-length-none -I/usr/include" container: image: underwoo/ubuntu_libfms_gnu env: diff --git a/affinity/Makefile.am b/affinity/Makefile.am index eecbc2dcb2..0191e47d24 100644 --- a/affinity/Makefile.am +++ b/affinity/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build this uninstalled convenience library. noinst_LTLIBRARIES = libfms_affinity.la libfms_affinity_c.la diff --git a/astronomy/Makefile.am b/astronomy/Makefile.am index 77ed95b42a..dcc56204ce 100644 --- a/astronomy/Makefile.am +++ b/astronomy/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build this uninstalled convenience library. noinst_LTLIBRARIES = libastronomy.la diff --git a/axis_utils/Makefile.am b/axis_utils/Makefile.am index 805974f032..4e305f5102 100644 --- a/axis_utils/Makefile.am +++ b/axis_utils/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. 
AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build this uninstalled convenience library. noinst_LTLIBRARIES = libaxis_utils.la libaxis_utils2.la diff --git a/block_control/Makefile.am b/block_control/Makefile.am index cd68b23123..9b21e8994f 100644 --- a/block_control/Makefile.am +++ b/block_control/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build this uninstalled convenience library. noinst_LTLIBRARIES = libblock_control.la diff --git a/block_control/block_control.F90 b/block_control/block_control.F90 index 4dcae2be9b..bdc12f3ebd 100644 --- a/block_control/block_control.F90 +++ b/block_control/block_control.F90 @@ -21,11 +21,10 @@ !! \brief Contains the \ref block_control_mod module module block_control_mod -#include use mpp_mod, only: mpp_error, NOTE, WARNING, FATAL use mpp_domains_mod, only: mpp_compute_extent - +use platform_mod implicit none public block_control_type diff --git a/column_diagnostics/Makefile.am b/column_diagnostics/Makefile.am index 11dd7b679b..edbec7a543 100644 --- a/column_diagnostics/Makefile.am +++ b/column_diagnostics/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build this uninstalled convenience library. noinst_LTLIBRARIES = libcolumn_diagnostics.la diff --git a/diag_integral/Makefile.am b/diag_integral/Makefile.am index 34237ef172..2470d315f7 100644 --- a/diag_integral/Makefile.am +++ b/diag_integral/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build this uninstalled convenience library. 
noinst_LTLIBRARIES = libdiag_integral.la diff --git a/diag_integral/diag_integral.F90 b/diag_integral/diag_integral.F90 index ca6d3c0897..ff4c2ae54b 100644 --- a/diag_integral/diag_integral.F90 +++ b/diag_integral/diag_integral.F90 @@ -21,7 +21,6 @@ !! \brief Contains the \ref diag_integral_mod module module diag_integral_mod -#include @@ -95,6 +94,7 @@ module diag_integral_mod !! - format_data_init !! +use platform_mod, only: i8_kind use time_manager_mod, only: time_type, get_time, set_time, & time_manager_init, & operator(+), operator(-), & @@ -1149,7 +1149,7 @@ subroutine write_field_averages (Time) real :: xtime, rcount integer :: nn, ninc, nst, nend, fields_to_print integer :: i, kount - integer(LONG_KIND) :: icount + integer(i8_kind) :: icount !------------------------------------------------------------------------------- ! each header and data format may be different and must be generated diff --git a/drifters/Makefile.am b/drifters/Makefile.am index f898bd2fe9..ac8ae25239 100644 --- a/drifters/Makefile.am +++ b/drifters/Makefile.am @@ -26,8 +26,9 @@ .NOTPARALLEL: # Include .h and .mod files. -AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include - +AM_CPPFLAGS = -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform +AM_CPPFLAGS += -I${top_builddir}/mpp # Build these uninstalled convenience libraries. 
noinst_LTLIBRARIES = libdrifters_core.la libdrifters_input.la \ diff --git a/drifters/cloud_interpolator.F90 b/drifters/cloud_interpolator.F90 index 947df99b29..5751d11445 100644 --- a/drifters/cloud_interpolator.F90 +++ b/drifters/cloud_interpolator.F90 @@ -21,7 +21,7 @@ #define _FLATTEN(A) reshape((A), (/size((A))/) ) MODULE cloud_interpolator_mod -#include + use platform_mod implicit none private diff --git a/drifters/drifters.F90 b/drifters/drifters.F90 index 48240f71f1..c301bf008a 100644 --- a/drifters/drifters.F90 +++ b/drifters/drifters.F90 @@ -22,7 +22,6 @@ #define _FLATTEN(A) reshape((A), (/size((A))/) ) module drifters_mod -#include ! ! Alexander Pletzer ! @@ -117,7 +116,7 @@ module drifters_mod drifters_comm_set_domain, drifters_comm_gather, drifters_comm_update use cloud_interpolator_mod, only: cld_ntrp_linear_cell_interp, cld_ntrp_locate_cell, cld_ntrp_get_cell_values - + use platform_mod implicit none private diff --git a/drifters/drifters_comm.F90 b/drifters/drifters_comm.F90 index ef914fcfbc..dbddfb9b2d 100644 --- a/drifters/drifters_comm.F90 +++ b/drifters/drifters_comm.F90 @@ -19,7 +19,6 @@ #include "fms_switches.h" module drifters_comm_mod -#include #ifdef _SERIAL @@ -37,6 +36,7 @@ module drifters_comm_mod use mpp_domains_mod, only : mpp_get_compute_domain, mpp_get_data_domain use mpp_domains_mod, only : NORTH, SOUTH, EAST, WEST, CYCLIC_GLOBAL_DOMAIN use mpp_domains_mod, only : NORTH_EAST, SOUTH_EAST, SOUTH_WEST, NORTH_WEST + use platform_mod #define _TYPE_DOMAIN2D type(domain2d) #define _NULL_PE NULL_PE diff --git a/drifters/drifters_core.F90 b/drifters/drifters_core.F90 index 4817130706..7105a8fa2b 100644 --- a/drifters/drifters_core.F90 +++ b/drifters/drifters_core.F90 @@ -21,7 +21,7 @@ module drifters_core_mod -#include + use platform_mod implicit none private @@ -39,11 +39,11 @@ module drifters_core_mod type drifters_core_type ! Be sure to update drifters_core_new, drifters_core_del and drifters_core_copy_new ! 
when adding members - integer*8 :: it ! time index + integer(kind=i8_kind) :: it ! time index integer :: nd ! number of dimensions integer :: np ! number of particles (drifters) integer :: npdim ! max number of particles (drifters) - integer, allocatable :: ids(:)_NULL ! particle id number + integer, allocatable :: ids(:) ! particle id number real , allocatable :: positions(:,:) end type drifters_core_type diff --git a/drifters/drifters_input.F90 b/drifters/drifters_input.F90 index 4894c41168..dfd2de2381 100644 --- a/drifters/drifters_input.F90 +++ b/drifters/drifters_input.F90 @@ -19,7 +19,7 @@ module drifters_input_mod -#include + use :: platform_mod implicit none private @@ -432,6 +432,3 @@ subroutine drifters_input_save(self, filename, geolon, geolat, ermesg) end subroutine drifters_input_save end module drifters_input_mod - -!=============================================================================== -!=============================================================================== diff --git a/drifters/drifters_io.F90 b/drifters/drifters_io.F90 index 5c0d87bc2a..93589b1335 100644 --- a/drifters/drifters_io.F90 +++ b/drifters/drifters_io.F90 @@ -16,7 +16,6 @@ !* You should have received a copy of the GNU Lesser General Public !* License along with FMS. If not, see . 
!*********************************************************************** -!!#include module drifters_io_mod @@ -302,5 +301,3 @@ subroutine drifters_io_write(self, time, np, nd, nf, ids, positions, fields, erm end subroutine drifters_io_write end module drifters_io_mod - !############################################################################### - !############################################################################### diff --git a/exchange/xgrid.F90 b/exchange/xgrid.F90 index 0414e59ddf..718338ef06 100644 --- a/exchange/xgrid.F90 +++ b/exchange/xgrid.F90 @@ -1,4 +1,4 @@ -!*********************************************************************** + !* GNU Lesser General Public License !* !* This file is part of the GFDL Flexible Modeling System (FMS). diff --git a/field_manager/Makefile.am b/field_manager/Makefile.am index 9850a511a1..749d51b018 100644 --- a/field_manager/Makefile.am +++ b/field_manager/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build these uninstalled convenience libraries. noinst_LTLIBRARIES = libfield_manager.la libfm_util.la diff --git a/fms2_io/Makefile.am b/fms2_io/Makefile.am index b6fcf47d2f..abd94e6c25 100644 --- a/fms2_io/Makefile.am +++ b/fms2_io/Makefile.am @@ -38,7 +38,8 @@ include/register_variable_attribute.inc \ include/unstructured_domain_write.inc # Include .h and .mod files. -AM_CPPFLAGS = -I${top_srcdir}/fms2_io/include -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_srcdir}/fms2_io/include -I${top_builddir}/.mod \ + -I${top_builddir}/platform # Build these uninstalled convenience libraries. 
noinst_LTLIBRARIES = libfms2_io.la libfms_io_utils.la libnetcdf_io.la \ diff --git a/fms2_io/blackboxio.F90 b/fms2_io/blackboxio.F90 index 610db6d206..722e53fcf4 100644 --- a/fms2_io/blackboxio.F90 +++ b/fms2_io/blackboxio.F90 @@ -25,7 +25,8 @@ module blackboxio use fms_netcdf_domain_io_mod use fms_netcdf_unstructured_domain_io_mod use mpp_mod, only: mpp_pe -use, intrinsic :: iso_fortran_env, only: error_unit, int32, int64, real32, real64 +use, intrinsic :: iso_fortran_env, only: error_unit +use platform_mod implicit none private @@ -178,9 +179,9 @@ subroutine copy_metadata(fileobj, new_fileobj) integer :: i integer :: j integer :: k - integer(kind=int32), dimension(:), allocatable :: buf_int - real(kind=real32), dimension(:), allocatable :: buf_float - real(kind=real64), dimension(:), allocatable :: buf_double + integer(kind=i4_kind), dimension(:), allocatable :: buf_int + real(kind=r4_kind), dimension(:), allocatable :: buf_float + real(kind=r8_kind), dimension(:), allocatable :: buf_double if (fileobj%is_root .and. .not. new_fileobj%is_readonly) then !Copy global attributes to the new file. diff --git a/fms2_io/fms2_io.F90 b/fms2_io/fms2_io.F90 index 89b7085fce..e9fe4d599b 100644 --- a/fms2_io/fms2_io.F90 +++ b/fms2_io/fms2_io.F90 @@ -28,6 +28,8 @@ module fms2_io_mod use blackboxio use mpp_mod, only: mpp_init, input_nml_file, mpp_error, FATAL use mpp_domains_mod, only: mpp_domains_init +use platform_mod + implicit none private diff --git a/fms2_io/fms_io_utils.F90 b/fms2_io/fms_io_utils.F90 index b8e6f2b995..d93ee0c810 100644 --- a/fms2_io/fms_io_utils.F90 +++ b/fms2_io/fms_io_utils.F90 @@ -21,11 +21,12 @@ !> @brief Utility routines. 
module fms_io_utils_mod -use, intrinsic :: iso_fortran_env, only: error_unit, int32, int64, real32, real64 +use, intrinsic :: iso_fortran_env, only: error_unit #ifdef _OPENMP use omp_lib #endif use mpp_mod +use platform_mod implicit none private @@ -465,3 +466,4 @@ end subroutine open_check end module fms_io_utils_mod + diff --git a/fms2_io/fms_netcdf_domain_io.F90 b/fms2_io/fms_netcdf_domain_io.F90 index 673e96c393..d7f72eb897 100644 --- a/fms2_io/fms_netcdf_domain_io.F90 +++ b/fms2_io/fms_netcdf_domain_io.F90 @@ -27,6 +27,7 @@ module fms_netcdf_domain_io_mod use mpp_domains_mod use fms_io_utils_mod use netcdf_io_mod +use platform_mod implicit none private diff --git a/fms2_io/include/array_utils.inc b/fms2_io/include/array_utils.inc index 3ff7a04f48..c1f2881e9f 100644 --- a/fms2_io/include/array_utils.inc +++ b/fms2_io/include/array_utils.inc @@ -20,7 +20,7 @@ !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_int32_1d(buf, sizes) - integer(kind=int32), dimension(:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i4_kind), dimension(:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(1), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -33,8 +33,8 @@ end subroutine allocate_array_int32_1d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int32_1d(section, array, s, c) - integer(kind=int32), dimension(:), intent(in) :: section !< Section to be inserted. - integer(kind=int32), dimension(:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i4_kind), dimension(:), intent(in) :: section !< Section to be inserted. + integer(kind=i4_kind), dimension(:), intent(inout) :: array !< Array to insert the section in. integer, dimension(1), intent(in) :: s !< Array of starting indices. integer, dimension(1), intent(in) :: c !< Array of sizes. 
@@ -45,8 +45,8 @@ end subroutine put_array_section_int32_1d !> @brief Get a section of larger array. subroutine get_array_section_int32_1d(section, array, s, c) - integer(kind=int32), dimension(:), intent(inout) :: section !< Section to be extracted. - integer(kind=int32), dimension(:), intent(in) :: array !< Array to extract the section from. + integer(kind=i4_kind), dimension(:), intent(inout) :: section !< Section to be extracted. + integer(kind=i4_kind), dimension(:), intent(in) :: array !< Array to extract the section from. integer, dimension(1), intent(in) :: s !< Array of starting indices. integer, dimension(1), intent(in) :: c !< Array of sizes. @@ -57,7 +57,7 @@ end subroutine get_array_section_int32_1d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_int32_2d(buf, sizes) - integer(kind=int32), dimension(:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i4_kind), dimension(:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(2), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -70,8 +70,8 @@ end subroutine allocate_array_int32_2d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int32_2d(section, array, s, c) - integer(kind=int32), dimension(:,:), intent(in) :: section !< Section to be inserted. - integer(kind=int32), dimension(:,:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i4_kind), dimension(:,:), intent(in) :: section !< Section to be inserted. + integer(kind=i4_kind), dimension(:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(2), intent(in) :: s !< Array of starting indices. integer, dimension(2), intent(in) :: c !< Array of sizes. @@ -82,8 +82,8 @@ end subroutine put_array_section_int32_2d !> @brief Get a section of larger array. 
subroutine get_array_section_int32_2d(section, array, s, c) - integer(kind=int32), dimension(:,:), intent(inout) :: section !< Section to be extracted. - integer(kind=int32), dimension(:,:), intent(in) :: array !< Array to extract the section from. + integer(kind=i4_kind), dimension(:,:), intent(inout) :: section !< Section to be extracted. + integer(kind=i4_kind), dimension(:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(2), intent(in) :: s !< Array of starting indices. integer, dimension(2), intent(in) :: c !< Array of sizes. @@ -94,7 +94,7 @@ end subroutine get_array_section_int32_2d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_int32_3d(buf, sizes) - integer(kind=int32), dimension(:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i4_kind), dimension(:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(3), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -107,8 +107,8 @@ end subroutine allocate_array_int32_3d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int32_3d(section, array, s, c) - integer(kind=int32), dimension(:,:,:), intent(in) :: section !< Section to be inserted. - integer(kind=int32), dimension(:,:,:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i4_kind), dimension(:,:,:), intent(in) :: section !< Section to be inserted. + integer(kind=i4_kind), dimension(:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(3), intent(in) :: s !< Array of starting indices. integer, dimension(3), intent(in) :: c !< Array of sizes. @@ -119,8 +119,8 @@ end subroutine put_array_section_int32_3d !> @brief Get a section of larger array. subroutine get_array_section_int32_3d(section, array, s, c) - integer(kind=int32), dimension(:,:,:), intent(inout) :: section !< Section to be extracted. 
- integer(kind=int32), dimension(:,:,:), intent(in) :: array !< Array to extract the section from. + integer(kind=i4_kind), dimension(:,:,:), intent(inout) :: section !< Section to be extracted. + integer(kind=i4_kind), dimension(:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(3), intent(in) :: s !< Array of starting indices. integer, dimension(3), intent(in) :: c !< Array of sizes. @@ -131,7 +131,7 @@ end subroutine get_array_section_int32_3d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_int32_4d(buf, sizes) - integer(kind=int32), dimension(:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i4_kind), dimension(:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(4), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -144,8 +144,8 @@ end subroutine allocate_array_int32_4d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int32_4d(section, array, s, c) - integer(kind=int32), dimension(:,:,:,:), intent(in) :: section !< Section to be inserted. - integer(kind=int32), dimension(:,:,:,:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i4_kind), dimension(:,:,:,:), intent(in) :: section !< Section to be inserted. + integer(kind=i4_kind), dimension(:,:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(4), intent(in) :: s !< Array of starting indices. integer, dimension(4), intent(in) :: c !< Array of sizes. @@ -156,8 +156,8 @@ end subroutine put_array_section_int32_4d !> @brief Get a section of larger array. subroutine get_array_section_int32_4d(section, array, s, c) - integer(kind=int32), dimension(:,:,:,:), intent(inout) :: section !< Section to be extracted. - integer(kind=int32), dimension(:,:,:,:), intent(in) :: array !< Array to extract the section from. 
+ integer(kind=i4_kind), dimension(:,:,:,:), intent(inout) :: section !< Section to be extracted. + integer(kind=i4_kind), dimension(:,:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(4), intent(in) :: s !< Array of starting indices. integer, dimension(4), intent(in) :: c !< Array of sizes. @@ -168,7 +168,7 @@ end subroutine get_array_section_int32_4d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_int32_5d(buf, sizes) - integer(kind=int32), dimension(:,:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i4_kind), dimension(:,:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(5), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -181,8 +181,8 @@ end subroutine allocate_array_int32_5d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int32_5d(section, array, s, c) - integer(kind=int32), dimension(:,:,:,:,:), intent(in) :: section !< Section to be inserted. - integer(kind=int32), dimension(:,:,:,:,:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i4_kind), dimension(:,:,:,:,:), intent(in) :: section !< Section to be inserted. + integer(kind=i4_kind), dimension(:,:,:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(5), intent(in) :: s !< Array of starting indices. integer, dimension(5), intent(in) :: c !< Array of sizes. @@ -193,8 +193,8 @@ end subroutine put_array_section_int32_5d !> @brief Get a section of larger array. subroutine get_array_section_int32_5d(section, array, s, c) - integer(kind=int32), dimension(:,:,:,:,:), intent(inout) :: section !< Section to be extracted. - integer(kind=int32), dimension(:,:,:,:,:), intent(in) :: array !< Array to extract the section from. + integer(kind=i4_kind), dimension(:,:,:,:,:), intent(inout) :: section !< Section to be extracted. 
+ integer(kind=i4_kind), dimension(:,:,:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(5), intent(in) :: s !< Array of starting indices. integer, dimension(5), intent(in) :: c !< Array of sizes. @@ -205,7 +205,7 @@ end subroutine get_array_section_int32_5d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_int64_1d(buf, sizes) - integer(kind=int64), dimension(:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i8_kind), dimension(:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(1), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -218,8 +218,8 @@ end subroutine allocate_array_int64_1d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int64_1d(section, array, s, c) - integer(kind=int64), dimension(:), intent(in) :: section !< Section to be inserted. - integer(kind=int64), dimension(:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i8_kind), dimension(:), intent(in) :: section !< Section to be inserted. + integer(kind=i8_kind), dimension(:), intent(inout) :: array !< Array to insert the section in. integer, dimension(1), intent(in) :: s !< Array of starting indices. integer, dimension(1), intent(in) :: c !< Array of sizes. @@ -230,8 +230,8 @@ end subroutine put_array_section_int64_1d !> @brief Get a section of larger array. subroutine get_array_section_int64_1d(section, array, s, c) - integer(kind=int64), dimension(:), intent(inout) :: section !< Section to be extracted. - integer(kind=int64), dimension(:), intent(in) :: array !< Array to extract the section from. + integer(kind=i8_kind), dimension(:), intent(inout) :: section !< Section to be extracted. + integer(kind=i8_kind), dimension(:), intent(in) :: array !< Array to extract the section from. integer, dimension(1), intent(in) :: s !< Array of starting indices. 
integer, dimension(1), intent(in) :: c !< Array of sizes. @@ -242,7 +242,7 @@ end subroutine get_array_section_int64_1d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_int64_2d(buf, sizes) - integer(kind=int64), dimension(:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i8_kind), dimension(:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(2), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -255,8 +255,8 @@ end subroutine allocate_array_int64_2d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int64_2d(section, array, s, c) - integer(kind=int64), dimension(:,:), intent(in) :: section !< Section to be inserted. - integer(kind=int64), dimension(:,:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i8_kind), dimension(:,:), intent(in) :: section !< Section to be inserted. + integer(kind=i8_kind), dimension(:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(2), intent(in) :: s !< Array of starting indices. integer, dimension(2), intent(in) :: c !< Array of sizes. @@ -267,8 +267,8 @@ end subroutine put_array_section_int64_2d !> @brief Get a section of larger array. subroutine get_array_section_int64_2d(section, array, s, c) - integer(kind=int64), dimension(:,:), intent(inout) :: section !< Section to be extracted. - integer(kind=int64), dimension(:,:), intent(in) :: array !< Array to extract the section from. + integer(kind=i8_kind), dimension(:,:), intent(inout) :: section !< Section to be extracted. + integer(kind=i8_kind), dimension(:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(2), intent(in) :: s !< Array of starting indices. integer, dimension(2), intent(in) :: c !< Array of sizes. 
@@ -279,7 +279,7 @@ end subroutine get_array_section_int64_2d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_int64_3d(buf, sizes) - integer(kind=int64), dimension(:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i8_kind), dimension(:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(3), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -292,8 +292,8 @@ end subroutine allocate_array_int64_3d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int64_3d(section, array, s, c) - integer(kind=int64), dimension(:,:,:), intent(in) :: section !< Section to be inserted. - integer(kind=int64), dimension(:,:,:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i8_kind), dimension(:,:,:), intent(in) :: section !< Section to be inserted. + integer(kind=i8_kind), dimension(:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(3), intent(in) :: s !< Array of starting indices. integer, dimension(3), intent(in) :: c !< Array of sizes. @@ -304,8 +304,8 @@ end subroutine put_array_section_int64_3d !> @brief Get a section of larger array. subroutine get_array_section_int64_3d(section, array, s, c) - integer(kind=int64), dimension(:,:,:), intent(inout) :: section !< Section to be extracted. - integer(kind=int64), dimension(:,:,:), intent(in) :: array !< Array to extract the section from. + integer(kind=i8_kind), dimension(:,:,:), intent(inout) :: section !< Section to be extracted. + integer(kind=i8_kind), dimension(:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(3), intent(in) :: s !< Array of starting indices. integer, dimension(3), intent(in) :: c !< Array of sizes. @@ -316,7 +316,7 @@ end subroutine get_array_section_int64_3d !> @brief Allocate arrays using an input array of sizes. 
subroutine allocate_array_int64_4d(buf, sizes) - integer(kind=int64), dimension(:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i8_kind), dimension(:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(4), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -329,8 +329,8 @@ end subroutine allocate_array_int64_4d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int64_4d(section, array, s, c) - integer(kind=int64), dimension(:,:,:,:), intent(in) :: section !< Section to be inserted. - integer(kind=int64), dimension(:,:,:,:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i8_kind), dimension(:,:,:,:), intent(in) :: section !< Section to be inserted. + integer(kind=i8_kind), dimension(:,:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(4), intent(in) :: s !< Array of starting indices. integer, dimension(4), intent(in) :: c !< Array of sizes. @@ -341,8 +341,8 @@ end subroutine put_array_section_int64_4d !> @brief Get a section of larger array. subroutine get_array_section_int64_4d(section, array, s, c) - integer(kind=int64), dimension(:,:,:,:), intent(inout) :: section !< Section to be extracted. - integer(kind=int64), dimension(:,:,:,:), intent(in) :: array !< Array to extract the section from. + integer(kind=i8_kind), dimension(:,:,:,:), intent(inout) :: section !< Section to be extracted. + integer(kind=i8_kind), dimension(:,:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(4), intent(in) :: s !< Array of starting indices. integer, dimension(4), intent(in) :: c !< Array of sizes. @@ -353,7 +353,7 @@ end subroutine get_array_section_int64_4d !> @brief Allocate arrays using an input array of sizes. 
subroutine allocate_array_int64_5d(buf, sizes) - integer(kind=int64), dimension(:,:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + integer(kind=i8_kind), dimension(:,:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(5), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -366,8 +366,8 @@ end subroutine allocate_array_int64_5d !> @brief Put a section of an array into a larger array. subroutine put_array_section_int64_5d(section, array, s, c) - integer(kind=int64), dimension(:,:,:,:,:), intent(in) :: section !< Section to be inserted. - integer(kind=int64), dimension(:,:,:,:,:), intent(inout) :: array !< Array to insert the section in. + integer(kind=i8_kind), dimension(:,:,:,:,:), intent(in) :: section !< Section to be inserted. + integer(kind=i8_kind), dimension(:,:,:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(5), intent(in) :: s !< Array of starting indices. integer, dimension(5), intent(in) :: c !< Array of sizes. @@ -378,8 +378,8 @@ end subroutine put_array_section_int64_5d !> @brief Get a section of larger array. subroutine get_array_section_int64_5d(section, array, s, c) - integer(kind=int64), dimension(:,:,:,:,:), intent(inout) :: section !< Section to be extracted. - integer(kind=int64), dimension(:,:,:,:,:), intent(in) :: array !< Array to extract the section from. + integer(kind=i8_kind), dimension(:,:,:,:,:), intent(inout) :: section !< Section to be extracted. + integer(kind=i8_kind), dimension(:,:,:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(5), intent(in) :: s !< Array of starting indices. integer, dimension(5), intent(in) :: c !< Array of sizes. @@ -390,7 +390,7 @@ end subroutine get_array_section_int64_5d !> @brief Allocate arrays using an input array of sizes. 
subroutine allocate_array_real32_1d(buf, sizes) - real(kind=real32), dimension(:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r4_kind), dimension(:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(1), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -403,8 +403,8 @@ end subroutine allocate_array_real32_1d !> @brief Put a section of an array into a larger array. subroutine put_array_section_real32_1d(section, array, s, c) - real(kind=real32), dimension(:), intent(in) :: section !< Section to be inserted. - real(kind=real32), dimension(:), intent(inout) :: array !< Array to insert the section in. + real(kind=r4_kind), dimension(:), intent(in) :: section !< Section to be inserted. + real(kind=r4_kind), dimension(:), intent(inout) :: array !< Array to insert the section in. integer, dimension(1), intent(in) :: s !< Array of starting indices. integer, dimension(1), intent(in) :: c !< Array of sizes. @@ -415,8 +415,8 @@ end subroutine put_array_section_real32_1d !> @brief Get a section of larger array. subroutine get_array_section_real32_1d(section, array, s, c) - real(kind=real32), dimension(:), intent(inout) :: section !< Section to be extracted. - real(kind=real32), dimension(:), intent(in) :: array !< Array to extract the section from. + real(kind=r4_kind), dimension(:), intent(inout) :: section !< Section to be extracted. + real(kind=r4_kind), dimension(:), intent(in) :: array !< Array to extract the section from. integer, dimension(1), intent(in) :: s !< Array of starting indices. integer, dimension(1), intent(in) :: c !< Array of sizes. @@ -427,7 +427,7 @@ end subroutine get_array_section_real32_1d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real32_2d(buf, sizes) - real(kind=real32), dimension(:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. 
+ real(kind=r4_kind), dimension(:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(2), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -440,8 +440,8 @@ end subroutine allocate_array_real32_2d !> @brief Put a section of an array into a larger array. subroutine put_array_section_real32_2d(section, array, s, c) - real(kind=real32), dimension(:,:), intent(in) :: section !< Section to be inserted. - real(kind=real32), dimension(:,:), intent(inout) :: array !< Array to insert the section in. + real(kind=r4_kind), dimension(:,:), intent(in) :: section !< Section to be inserted. + real(kind=r4_kind), dimension(:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(2), intent(in) :: s !< Array of starting indices. integer, dimension(2), intent(in) :: c !< Array of sizes. @@ -452,8 +452,8 @@ end subroutine put_array_section_real32_2d !> @brief Get a section of larger array. subroutine get_array_section_real32_2d(section, array, s, c) - real(kind=real32), dimension(:,:), intent(inout) :: section !< Section to be extracted. - real(kind=real32), dimension(:,:), intent(in) :: array !< Array to extract the section from. + real(kind=r4_kind), dimension(:,:), intent(inout) :: section !< Section to be extracted. + real(kind=r4_kind), dimension(:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(2), intent(in) :: s !< Array of starting indices. integer, dimension(2), intent(in) :: c !< Array of sizes. @@ -464,7 +464,7 @@ end subroutine get_array_section_real32_2d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real32_3d(buf, sizes) - real(kind=real32), dimension(:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r4_kind), dimension(:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(3), intent(in) :: sizes !< Array of dimension sizes. 
if (allocated(buf)) then @@ -477,8 +477,8 @@ end subroutine allocate_array_real32_3d !> @brief Put a section of an array into a larger array. subroutine put_array_section_real32_3d(section, array, s, c) - real(kind=real32), dimension(:,:,:), intent(in) :: section !< Section to be inserted. - real(kind=real32), dimension(:,:,:), intent(inout) :: array !< Array to insert the section in. + real(kind=r4_kind), dimension(:,:,:), intent(in) :: section !< Section to be inserted. + real(kind=r4_kind), dimension(:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(3), intent(in) :: s !< Array of starting indices. integer, dimension(3), intent(in) :: c !< Array of sizes. @@ -489,8 +489,8 @@ end subroutine put_array_section_real32_3d !> @brief Get a section of larger array. subroutine get_array_section_real32_3d(section, array, s, c) - real(kind=real32), dimension(:,:,:), intent(inout) :: section !< Section to be extracted. - real(kind=real32), dimension(:,:,:), intent(in) :: array !< Array to extract the section from. + real(kind=r4_kind), dimension(:,:,:), intent(inout) :: section !< Section to be extracted. + real(kind=r4_kind), dimension(:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(3), intent(in) :: s !< Array of starting indices. integer, dimension(3), intent(in) :: c !< Array of sizes. @@ -501,7 +501,7 @@ end subroutine get_array_section_real32_3d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real32_4d(buf, sizes) - real(kind=real32), dimension(:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r4_kind), dimension(:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(4), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -514,8 +514,8 @@ end subroutine allocate_array_real32_4d !> @brief Put a section of an array into a larger array. 
subroutine put_array_section_real32_4d(section, array, s, c) - real(kind=real32), dimension(:,:,:,:), intent(in) :: section !< Section to be inserted. - real(kind=real32), dimension(:,:,:,:), intent(inout) :: array !< Array to insert the section in. + real(kind=r4_kind), dimension(:,:,:,:), intent(in) :: section !< Section to be inserted. + real(kind=r4_kind), dimension(:,:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(4), intent(in) :: s !< Array of starting indices. integer, dimension(4), intent(in) :: c !< Array of sizes. @@ -526,8 +526,8 @@ end subroutine put_array_section_real32_4d !> @brief Get a section of larger array. subroutine get_array_section_real32_4d(section, array, s, c) - real(kind=real32), dimension(:,:,:,:), intent(inout) :: section !< Section to be extracted. - real(kind=real32), dimension(:,:,:,:), intent(in) :: array !< Array to extract the section from. + real(kind=r4_kind), dimension(:,:,:,:), intent(inout) :: section !< Section to be extracted. + real(kind=r4_kind), dimension(:,:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(4), intent(in) :: s !< Array of starting indices. integer, dimension(4), intent(in) :: c !< Array of sizes. @@ -538,7 +538,7 @@ end subroutine get_array_section_real32_4d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real32_5d(buf, sizes) - real(kind=real32), dimension(:,:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r4_kind), dimension(:,:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(5), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -551,8 +551,8 @@ end subroutine allocate_array_real32_5d !> @brief Put a section of an array into a larger array. 
subroutine put_array_section_real32_5d(section, array, s, c) - real(kind=real32), dimension(:,:,:,:,:), intent(in) :: section !< Section to be inserted. - real(kind=real32), dimension(:,:,:,:,:), intent(inout) :: array !< Array to insert the section in. + real(kind=r4_kind), dimension(:,:,:,:,:), intent(in) :: section !< Section to be inserted. + real(kind=r4_kind), dimension(:,:,:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(5), intent(in) :: s !< Array of starting indices. integer, dimension(5), intent(in) :: c !< Array of sizes. @@ -563,8 +563,8 @@ end subroutine put_array_section_real32_5d !> @brief Get a section of larger array. subroutine get_array_section_real32_5d(section, array, s, c) - real(kind=real32), dimension(:,:,:,:,:), intent(inout) :: section !< Section to be extracted. - real(kind=real32), dimension(:,:,:,:,:), intent(in) :: array !< Array to extract the section from. + real(kind=r4_kind), dimension(:,:,:,:,:), intent(inout) :: section !< Section to be extracted. + real(kind=r4_kind), dimension(:,:,:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(5), intent(in) :: s !< Array of starting indices. integer, dimension(5), intent(in) :: c !< Array of sizes. @@ -575,7 +575,7 @@ end subroutine get_array_section_real32_5d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real64_1d(buf, sizes) - real(kind=real64), dimension(:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r8_kind), dimension(:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(1), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -588,8 +588,8 @@ end subroutine allocate_array_real64_1d !> @brief Put a section of an array into a larger array. subroutine put_array_section_real64_1d(section, array, s, c) - real(kind=real64), dimension(:), intent(in) :: section !< Section to be inserted. 
- real(kind=real64), dimension(:), intent(inout) :: array !< Array to insert the section in. + real(kind=r8_kind), dimension(:), intent(in) :: section !< Section to be inserted. + real(kind=r8_kind), dimension(:), intent(inout) :: array !< Array to insert the section in. integer, dimension(1), intent(in) :: s !< Array of starting indices. integer, dimension(1), intent(in) :: c !< Array of sizes. @@ -600,8 +600,8 @@ end subroutine put_array_section_real64_1d !> @brief Get a section of larger array. subroutine get_array_section_real64_1d(section, array, s, c) - real(kind=real64), dimension(:), intent(inout) :: section !< Section to be extracted. - real(kind=real64), dimension(:), intent(in) :: array !< Array to extract the section from. + real(kind=r8_kind), dimension(:), intent(inout) :: section !< Section to be extracted. + real(kind=r8_kind), dimension(:), intent(in) :: array !< Array to extract the section from. integer, dimension(1), intent(in) :: s !< Array of starting indices. integer, dimension(1), intent(in) :: c !< Array of sizes. @@ -612,7 +612,7 @@ end subroutine get_array_section_real64_1d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real64_2d(buf, sizes) - real(kind=real64), dimension(:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r8_kind), dimension(:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(2), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -625,8 +625,8 @@ end subroutine allocate_array_real64_2d !> @brief Put a section of an array into a larger array. subroutine put_array_section_real64_2d(section, array, s, c) - real(kind=real64), dimension(:,:), intent(in) :: section !< Section to be inserted. - real(kind=real64), dimension(:,:), intent(inout) :: array !< Array to insert the section in. + real(kind=r8_kind), dimension(:,:), intent(in) :: section !< Section to be inserted. 
+ real(kind=r8_kind), dimension(:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(2), intent(in) :: s !< Array of starting indices. integer, dimension(2), intent(in) :: c !< Array of sizes. @@ -637,8 +637,8 @@ end subroutine put_array_section_real64_2d !> @brief Get a section of larger array. subroutine get_array_section_real64_2d(section, array, s, c) - real(kind=real64), dimension(:,:), intent(inout) :: section !< Section to be extracted. - real(kind=real64), dimension(:,:), intent(in) :: array !< Array to extract the section from. + real(kind=r8_kind), dimension(:,:), intent(inout) :: section !< Section to be extracted. + real(kind=r8_kind), dimension(:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(2), intent(in) :: s !< Array of starting indices. integer, dimension(2), intent(in) :: c !< Array of sizes. @@ -649,7 +649,7 @@ end subroutine get_array_section_real64_2d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real64_3d(buf, sizes) - real(kind=real64), dimension(:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r8_kind), dimension(:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(3), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -662,8 +662,8 @@ end subroutine allocate_array_real64_3d !> @brief Put a section of an array into a larger array. subroutine put_array_section_real64_3d(section, array, s, c) - real(kind=real64), dimension(:,:,:), intent(in) :: section !< Section to be inserted. - real(kind=real64), dimension(:,:,:), intent(inout) :: array !< Array to insert the section in. + real(kind=r8_kind), dimension(:,:,:), intent(in) :: section !< Section to be inserted. + real(kind=r8_kind), dimension(:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(3), intent(in) :: s !< Array of starting indices. 
integer, dimension(3), intent(in) :: c !< Array of sizes. @@ -674,8 +674,8 @@ end subroutine put_array_section_real64_3d !> @brief Get a section of larger array. subroutine get_array_section_real64_3d(section, array, s, c) - real(kind=real64), dimension(:,:,:), intent(inout) :: section !< Section to be extracted. - real(kind=real64), dimension(:,:,:), intent(in) :: array !< Array to extract the section from. + real(kind=r8_kind), dimension(:,:,:), intent(inout) :: section !< Section to be extracted. + real(kind=r8_kind), dimension(:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(3), intent(in) :: s !< Array of starting indices. integer, dimension(3), intent(in) :: c !< Array of sizes. @@ -686,7 +686,7 @@ end subroutine get_array_section_real64_3d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real64_4d(buf, sizes) - real(kind=real64), dimension(:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r8_kind), dimension(:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(4), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -699,8 +699,8 @@ end subroutine allocate_array_real64_4d !> @brief Put a section of an array into a larger array. subroutine put_array_section_real64_4d(section, array, s, c) - real(kind=real64), dimension(:,:,:,:), intent(in) :: section !< Section to be inserted. - real(kind=real64), dimension(:,:,:,:), intent(inout) :: array !< Array to insert the section in. + real(kind=r8_kind), dimension(:,:,:,:), intent(in) :: section !< Section to be inserted. + real(kind=r8_kind), dimension(:,:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(4), intent(in) :: s !< Array of starting indices. integer, dimension(4), intent(in) :: c !< Array of sizes. 
@@ -711,8 +711,8 @@ end subroutine put_array_section_real64_4d !> @brief Get a section of larger array. subroutine get_array_section_real64_4d(section, array, s, c) - real(kind=real64), dimension(:,:,:,:), intent(inout) :: section !< Section to be extracted. - real(kind=real64), dimension(:,:,:,:), intent(in) :: array !< Array to extract the section from. + real(kind=r8_kind), dimension(:,:,:,:), intent(inout) :: section !< Section to be extracted. + real(kind=r8_kind), dimension(:,:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(4), intent(in) :: s !< Array of starting indices. integer, dimension(4), intent(in) :: c !< Array of sizes. @@ -723,7 +723,7 @@ end subroutine get_array_section_real64_4d !> @brief Allocate arrays using an input array of sizes. subroutine allocate_array_real64_5d(buf, sizes) - real(kind=real64), dimension(:,:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. + real(kind=r8_kind), dimension(:,:,:,:,:), allocatable, intent(inout) :: buf !< Array that will be allocated. integer, dimension(5), intent(in) :: sizes !< Array of dimension sizes. if (allocated(buf)) then @@ -736,8 +736,8 @@ end subroutine allocate_array_real64_5d !> @brief Put a section of an array into a larger array. subroutine put_array_section_real64_5d(section, array, s, c) - real(kind=real64), dimension(:,:,:,:,:), intent(in) :: section !< Section to be inserted. - real(kind=real64), dimension(:,:,:,:,:), intent(inout) :: array !< Array to insert the section in. + real(kind=r8_kind), dimension(:,:,:,:,:), intent(in) :: section !< Section to be inserted. + real(kind=r8_kind), dimension(:,:,:,:,:), intent(inout) :: array !< Array to insert the section in. integer, dimension(5), intent(in) :: s !< Array of starting indices. integer, dimension(5), intent(in) :: c !< Array of sizes. @@ -748,12 +748,10 @@ end subroutine put_array_section_real64_5d !> @brief Get a section of larger array. 
subroutine get_array_section_real64_5d(section, array, s, c) - real(kind=real64), dimension(:,:,:,:,:), intent(inout) :: section !< Section to be extracted. - real(kind=real64), dimension(:,:,:,:,:), intent(in) :: array !< Array to extract the section from. + real(kind=r8_kind), dimension(:,:,:,:,:), intent(inout) :: section !< Section to be extracted. + real(kind=r8_kind), dimension(:,:,:,:,:), intent(in) :: array !< Array to extract the section from. integer, dimension(5), intent(in) :: s !< Array of starting indices. integer, dimension(5), intent(in) :: c !< Array of sizes. section(:,:,:,:,:) = array(s(1):s(1)+c(1)-1 ,s(2):s(2)+c(2)-1 ,s(3):s(3)+c(3)-1 ,s(4):s(4)+c(4)-1 ,s(5):s(5)+c(5)-1 ) end subroutine get_array_section_real64_5d - - diff --git a/fms2_io/include/compute_global_checksum.inc b/fms2_io/include/compute_global_checksum.inc index 1e98f436f6..8a32ed5b92 100644 --- a/fms2_io/include/compute_global_checksum.inc +++ b/fms2_io/include/compute_global_checksum.inc @@ -44,15 +44,15 @@ function compute_global_checksum_2d(fileobj, variable_name, variable_data, is_de logical :: extra_y integer, dimension(2) :: c integer, dimension(2) :: e - integer(kind=int32), dimension(:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:), allocatable :: buf_real64 - integer(kind=int32) :: fill_int32 - integer(kind=int64) :: fill_int64 - real(kind=real32) :: fill_real32 - real(kind=real64) :: fill_real64 - integer(kind=int64) :: chksum_val + integer(kind=i4_kind), dimension(:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:), allocatable :: buf_r8_kind + integer(kind=i4_kind) :: fill_i4_kind + integer(kind=i8_kind) :: fill_i8_kind + real(kind=r4_kind) :: fill_r4_kind + real(kind=r8_kind) 
:: fill_r8_kind + integer(kind=i8_kind) :: chksum_val is_decomposed = is_variable_domain_decomposed(fileobj, variable_name, & xindex=xdim, yindex=ydim, & @@ -88,46 +88,42 @@ function compute_global_checksum_2d(fileobj, variable_name, variable_data, is_de endif select type (variable_data) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call get_array_section(buf_int32, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_int32)) then - chksum_val = mpp_chksum(buf_int32, mask_val=fill_int32) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call get_array_section(buf_i4_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_i4_kind)) then + chksum_val = mpp_chksum(buf_i4_kind, mask_val=fill_i4_kind) else - chksum_val = mpp_chksum(buf_int32) + chksum_val = mpp_chksum(buf_i4_kind) endif - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call get_array_section(buf_int64, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_int64)) then - chksum_val = mpp_chksum(buf_int64, mask_val=fill_int64) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call get_array_section(buf_i8_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_i8_kind)) then + chksum_val = mpp_chksum(buf_i8_kind, mask_val=fill_i8_kind) else - chksum_val = mpp_chksum(buf_int64) + chksum_val = mpp_chksum(buf_i8_kind) endif - deallocate(buf_int64) - type is (real(kind=real32)) -#ifdef OVERLOAD_R4 - call allocate_array(buf_real32, e) - call get_array_section(buf_real32, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_real32)) then - chksum_val = mpp_chksum(buf_real32, mask_val=fill_real32) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call get_array_section(buf_r4_kind, variable_data, c, e) + if (get_fill_value(fileobj, 
variable_name, fill_r4_kind)) then + chksum_val = mpp_chksum(buf_r4_kind, mask_val=fill_r4_kind) else - chksum_val = mpp_chksum(buf_real32) + chksum_val = mpp_chksum(buf_r4_kind) endif - deallocate(buf_real32) -#else - call error("Compute_global_checksum: you are trying to use a real*4 without defining OVERLOAD_R4") -#endif - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call get_array_section(buf_real64, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_real64)) then - chksum_val = mpp_chksum(buf_real64, mask_val=fill_real64) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call get_array_section(buf_r8_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_r8_kind)) then + chksum_val = mpp_chksum(buf_r8_kind, mask_val=fill_r8_kind) else - chksum_val = mpp_chksum(buf_real64) + chksum_val = mpp_chksum(buf_r8_kind) endif - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("Compute_global_checksum: unsupported type.") end select @@ -163,15 +159,15 @@ function compute_global_checksum_3d(fileobj, variable_name, variable_data, is_de logical :: extra_y integer, dimension(3) :: c integer, dimension(3) :: e - integer(kind=int32), dimension(:,:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:,:), allocatable :: buf_real64 - integer(kind=int32) :: fill_int32 - integer(kind=int64) :: fill_int64 - real(kind=real32) :: fill_real32 - real(kind=real64) :: fill_real64 - integer(kind=int64) :: chksum_val + integer(kind=i4_kind), dimension(:,:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:,:), allocatable :: buf_r8_kind + integer(kind=i4_kind) :: 
fill_i4_kind + integer(kind=i8_kind) :: fill_i8_kind + real(kind=r4_kind) :: fill_r4_kind + real(kind=r8_kind) :: fill_r8_kind + integer(kind=i8_kind) :: chksum_val is_decomposed = is_variable_domain_decomposed(fileobj, variable_name, & xindex=xdim, yindex=ydim, & @@ -206,46 +202,42 @@ function compute_global_checksum_3d(fileobj, variable_name, variable_data, is_de e(ydim) = e(ydim) - 1 endif select type (variable_data) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call get_array_section(buf_int32, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_int32)) then - chksum_val = mpp_chksum(buf_int32, mask_val=fill_int32) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call get_array_section(buf_i4_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_i4_kind)) then + chksum_val = mpp_chksum(buf_i4_kind, mask_val=fill_i4_kind) else - chksum_val = mpp_chksum(buf_int32) + chksum_val = mpp_chksum(buf_i4_kind) endif - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call get_array_section(buf_int64, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_int64)) then - chksum_val = mpp_chksum(buf_int64, mask_val=fill_int64) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call get_array_section(buf_i8_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_i8_kind)) then + chksum_val = mpp_chksum(buf_i8_kind, mask_val=fill_i8_kind) else - chksum_val = mpp_chksum(buf_int64) + chksum_val = mpp_chksum(buf_i8_kind) endif - deallocate(buf_int64) - type is (real(kind=real32)) -#ifdef OVERLOAD_R4 - call allocate_array(buf_real32, e) - call get_array_section(buf_real32, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_real32)) then - chksum_val = mpp_chksum(buf_real32, mask_val=fill_real32) + deallocate(buf_i8_kind) + type is 
(real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call get_array_section(buf_r4_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_r4_kind)) then + chksum_val = mpp_chksum(buf_r4_kind, mask_val=fill_r4_kind) else - chksum_val = mpp_chksum(buf_real32) + chksum_val = mpp_chksum(buf_r4_kind) endif - deallocate(buf_real32) -#else - call error("Compute_global_checksum: you are trying to use a real*4 without defining OVERLOAD_R4") -#endif - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call get_array_section(buf_real64, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_real64)) then - chksum_val = mpp_chksum(buf_real64, mask_val=fill_real64) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call get_array_section(buf_r8_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_r8_kind)) then + chksum_val = mpp_chksum(buf_r8_kind, mask_val=fill_r8_kind) else - chksum_val = mpp_chksum(buf_real64) + chksum_val = mpp_chksum(buf_r8_kind) endif - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("Compute_global_checksum: unsupported type.") end select @@ -281,15 +273,15 @@ function compute_global_checksum_4d(fileobj, variable_name, variable_data, is_de logical :: extra_y integer, dimension(4) :: c integer, dimension(4) :: e - integer(kind=int32), dimension(:,:,:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:,:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:,:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:,:,:), allocatable :: buf_real64 - integer(kind=int32) :: fill_int32 - integer(kind=int64) :: fill_int64 - real(kind=real32) :: fill_real32 - real(kind=real64) :: fill_real64 - integer(kind=int64) :: chksum_val + integer(kind=i4_kind), dimension(:,:,:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:,:), allocatable :: buf_i8_kind + 
real(kind=r4_kind), dimension(:,:,:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:,:,:), allocatable :: buf_r8_kind + integer(kind=i4_kind) :: fill_i4_kind + integer(kind=i8_kind) :: fill_i8_kind + real(kind=r4_kind) :: fill_r4_kind + real(kind=r8_kind) :: fill_r8_kind + integer(kind=i8_kind) :: chksum_val is_decomposed = is_variable_domain_decomposed(fileobj, variable_name, & xindex=xdim, yindex=ydim, & @@ -325,46 +317,42 @@ function compute_global_checksum_4d(fileobj, variable_name, variable_data, is_de endif select type (variable_data) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call get_array_section(buf_int32, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_int32)) then - chksum_val = mpp_chksum(buf_int32, mask_val=fill_int32) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call get_array_section(buf_i4_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_i4_kind)) then + chksum_val = mpp_chksum(buf_i4_kind, mask_val=fill_i4_kind) else - chksum_val = mpp_chksum(buf_int32) + chksum_val = mpp_chksum(buf_i4_kind) endif - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call get_array_section(buf_int64, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_int64)) then - chksum_val = mpp_chksum(buf_int64, mask_val=fill_int64) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call get_array_section(buf_i8_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_i8_kind)) then + chksum_val = mpp_chksum(buf_i8_kind, mask_val=fill_i8_kind) else - chksum_val = mpp_chksum(buf_int64) + chksum_val = mpp_chksum(buf_i8_kind) endif - deallocate(buf_int64) - type is (real(kind=real32)) -#ifdef OVERLOAD_R4 - call allocate_array(buf_real32, e) - call get_array_section(buf_real32, variable_data, c, e) - if (get_fill_value(fileobj, 
variable_name, fill_real32)) then - chksum_val = mpp_chksum(buf_real32, mask_val=fill_real32) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call get_array_section(buf_r4_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_r4_kind)) then + chksum_val = mpp_chksum(buf_r4_kind, mask_val=fill_r4_kind) else - chksum_val = mpp_chksum(buf_real32) + chksum_val = mpp_chksum(buf_r4_kind) endif - deallocate(buf_real32) -#else - call error("Compute_global_checksum: you are trying to use a real*4 without defining OVERLOAD_R4") -#endif - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call get_array_section(buf_real64, variable_data, c, e) - if (get_fill_value(fileobj, variable_name, fill_real64)) then - chksum_val = mpp_chksum(buf_real64, mask_val=fill_real64) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call get_array_section(buf_r8_kind, variable_data, c, e) + if (get_fill_value(fileobj, variable_name, fill_r8_kind)) then + chksum_val = mpp_chksum(buf_r8_kind, mask_val=fill_r8_kind) else - chksum_val = mpp_chksum(buf_real64) + chksum_val = mpp_chksum(buf_r8_kind) endif - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("Compute_global_checksum: unsupported type.") end select diff --git a/fms2_io/include/domain_read.inc b/fms2_io/include/domain_read.inc index e7b93310eb..98fb44e604 100644 --- a/fms2_io/include/domain_read.inc +++ b/fms2_io/include/domain_read.inc @@ -124,10 +124,10 @@ subroutine domain_read_2d(fileobj, variable_name, vdata, unlim_dim_level, & integer, dimension(:), allocatable :: pe_jcsize integer, dimension(2) :: c integer, dimension(2) :: e - integer(kind=int32), dimension(:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:), allocatable :: buf_real64 
+ integer(kind=i4_kind), dimension(:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:), allocatable :: buf_r8_kind logical :: buffer_includes_halos integer :: xgmin !< Starting x index of global io domain integer :: ygmin !< Starting y index of global io domain @@ -165,10 +165,10 @@ subroutine domain_read_2d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = pe_icsize(i) e(ydim_index) = pe_jcsize(i) select type(vdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_int32, e) - call netcdf_read_data(fileobj, variable_name, buf_int32, & + call allocate_array(buf_i4_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_i4_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -181,17 +181,17 @@ subroutine domain_read_2d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_int32, vdata, c, e) + call put_array_section(buf_i4_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_int32, size(buf_int32), fileobj%pelist(i)) + call mpp_send(buf_i4_kind, size(buf_i4_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_int32) - type is (integer(kind=int64)) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_int64, e) - call netcdf_read_data(fileobj, variable_name, buf_int64, & + call allocate_array(buf_i8_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_i8_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 
1) then @@ -204,17 +204,17 @@ subroutine domain_read_2d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_int64, vdata, c, e) + call put_array_section(buf_i8_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_int64, size(buf_int64), fileobj%pelist(i)) + call mpp_send(buf_i8_kind, size(buf_i8_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_int64) - type is (real(kind=real32)) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_real32, e) - call netcdf_read_data(fileobj, variable_name, buf_real32, & + call allocate_array(buf_r4_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_r4_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -227,17 +227,17 @@ subroutine domain_read_2d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_real32, vdata, c, e) + call put_array_section(buf_r4_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_real32, size(buf_real32), fileobj%pelist(i)) + call mpp_send(buf_r4_kind, size(buf_r4_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_real32) - type is (real(kind=real64)) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_real64, e) - call netcdf_read_data(fileobj, variable_name, buf_real64, & + call allocate_array(buf_r8_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_r8_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 
1) then @@ -250,13 +250,13 @@ subroutine domain_read_2d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_real64, vdata, c, e) + call put_array_section(buf_r8_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_real64, size(buf_real64), fileobj%pelist(i)) + call mpp_send(buf_r8_kind, size(buf_r8_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("domain_read_2d: Unsupported type for variable: "//variable_name//" in file: "//trim(fileobj%path)//"") end select @@ -273,26 +273,26 @@ subroutine domain_read_2d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = xc_size e(ydim_index) = yc_size select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call mpp_recv(buf_int32, size(buf_int32), fileobj%io_root, block=.true.) - call put_array_section(buf_int32, vdata, c, e) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call mpp_recv(buf_int64, size(buf_int64), fileobj%io_root, block=.true.) - call put_array_section(buf_int64, vdata, c, e) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) - call mpp_recv(buf_real32, size(buf_real32), fileobj%io_root, block=.true.) - call put_array_section(buf_real32, vdata, c, e) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call mpp_recv(buf_real64, size(buf_real64), fileobj%io_root, block=.true.) - call put_array_section(buf_real64, vdata, c, e) - deallocate(buf_real64) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call mpp_recv(buf_i4_kind, size(buf_i4_kind), fileobj%io_root, block=.true.) 
+ call put_array_section(buf_i4_kind, vdata, c, e) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call mpp_recv(buf_i8_kind, size(buf_i8_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_i8_kind, vdata, c, e) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call mpp_recv(buf_r4_kind, size(buf_r4_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_r4_kind, vdata, c, e) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call mpp_recv(buf_r8_kind, size(buf_r8_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_r8_kind, vdata, c, e) + deallocate(buf_r8_kind) class default call error("domain_read_2d: Unsupported type for variable: "//variable_name//" in file: "//trim(fileobj%path)//"") end select @@ -342,10 +342,10 @@ subroutine domain_read_3d(fileobj, variable_name, vdata, unlim_dim_level, & integer, dimension(:), allocatable :: pe_jcsize integer, dimension(3) :: c integer, dimension(3) :: e - integer(kind=int32), dimension(:,:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:,:), allocatable :: buf_real64 + integer(kind=i4_kind), dimension(:,:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:,:), allocatable :: buf_r8_kind logical :: buffer_includes_halos integer :: xgmin !< Starting x index of global io domain integer :: ygmin !< Starting y index of global io domain @@ -383,10 +383,10 @@ subroutine domain_read_3d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = pe_icsize(i) e(ydim_index) = pe_jcsize(i) select type(vdata) - type is (integer(kind=int32)) + type is 
(integer(kind=i4_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_int32, e) - call netcdf_read_data(fileobj, variable_name, buf_int32, & + call allocate_array(buf_i4_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_i4_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -399,17 +399,17 @@ subroutine domain_read_3d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_int32, vdata, c, e) + call put_array_section(buf_i4_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_int32, size(buf_int32), fileobj%pelist(i)) + call mpp_send(buf_i4_kind, size(buf_i4_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_int32) - type is (integer(kind=int64)) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_int64, e) - call netcdf_read_data(fileobj, variable_name, buf_int64, & + call allocate_array(buf_i8_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_i8_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -422,17 +422,17 @@ subroutine domain_read_3d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_int64, vdata, c, e) + call put_array_section(buf_i8_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_int64, size(buf_int64), fileobj%pelist(i)) + call mpp_send(buf_i8_kind, size(buf_i8_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_int64) - type is (real(kind=real32)) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. 
- call allocate_array(buf_real32, e) - call netcdf_read_data(fileobj, variable_name, buf_real32, & + call allocate_array(buf_r4_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_r4_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -445,17 +445,17 @@ subroutine domain_read_3d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_real32, vdata, c, e) + call put_array_section(buf_r4_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_real32, size(buf_real32), fileobj%pelist(i)) + call mpp_send(buf_r4_kind, size(buf_r4_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_real32) - type is (real(kind=real64)) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_real64, e) - call netcdf_read_data(fileobj, variable_name, buf_real64, & + call allocate_array(buf_r8_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_r8_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -468,13 +468,13 @@ subroutine domain_read_3d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_real64, vdata, c, e) + call put_array_section(buf_r8_kind, vdata, c, e) else !Send data to non-root ranks. 
- call mpp_send(buf_real64, size(buf_real64), fileobj%pelist(i)) + call mpp_send(buf_r8_kind, size(buf_r8_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("domain_read_3d: Unsupported type for variable: "//variable_name//" in file: "//trim(fileobj%path)//"") end select @@ -491,26 +491,26 @@ subroutine domain_read_3d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = xc_size e(ydim_index) = yc_size select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call mpp_recv(buf_int32, size(buf_int32), fileobj%io_root, block=.true.) - call put_array_section(buf_int32, vdata, c, e) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call mpp_recv(buf_int64, size(buf_int64), fileobj%io_root, block=.true.) - call put_array_section(buf_int64, vdata, c, e) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) - call mpp_recv(buf_real32, size(buf_real32), fileobj%io_root, block=.true.) - call put_array_section(buf_real32, vdata, c, e) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call mpp_recv(buf_real64, size(buf_real64), fileobj%io_root, block=.true.) - call put_array_section(buf_real64, vdata, c, e) - deallocate(buf_real64) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call mpp_recv(buf_i4_kind, size(buf_i4_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_i4_kind, vdata, c, e) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call mpp_recv(buf_i8_kind, size(buf_i8_kind), fileobj%io_root, block=.true.) 
+ call put_array_section(buf_i8_kind, vdata, c, e) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call mpp_recv(buf_r4_kind, size(buf_r4_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_r4_kind, vdata, c, e) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call mpp_recv(buf_r8_kind, size(buf_r8_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_r8_kind, vdata, c, e) + deallocate(buf_r8_kind) class default call error("domain_read_3d: Unsupported type for variable: "//variable_name//" in file: "//trim(fileobj%path)//"") end select @@ -560,10 +560,10 @@ subroutine domain_read_4d(fileobj, variable_name, vdata, unlim_dim_level, & integer, dimension(:), allocatable :: pe_jcsize integer, dimension(4) :: c integer, dimension(4) :: e - integer(kind=int32), dimension(:,:,:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:,:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:,:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:,:,:), allocatable :: buf_real64 + integer(kind=i4_kind), dimension(:,:,:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:,:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:,:,:), allocatable :: buf_r8_kind logical :: buffer_includes_halos integer :: xgmin !< Starting x index of global io domain integer :: ygmin !< Starting y index of global io domain @@ -601,10 +601,10 @@ subroutine domain_read_4d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = pe_icsize(i) e(ydim_index) = pe_jcsize(i) select type(vdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. 
- call allocate_array(buf_int32, e) - call netcdf_read_data(fileobj, variable_name, buf_int32, & + call allocate_array(buf_i4_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_i4_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -617,17 +617,17 @@ subroutine domain_read_4d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_int32, vdata, c, e) + call put_array_section(buf_i4_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_int32, size(buf_int32), fileobj%pelist(i)) + call mpp_send(buf_i4_kind, size(buf_i4_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_int32) - type is (integer(kind=int64)) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_int64, e) - call netcdf_read_data(fileobj, variable_name, buf_int64, & + call allocate_array(buf_i8_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_i8_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -640,17 +640,17 @@ subroutine domain_read_4d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_int64, vdata, c, e) + call put_array_section(buf_i8_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_int64, size(buf_int64), fileobj%pelist(i)) + call mpp_send(buf_i8_kind, size(buf_i8_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_int64) - type is (real(kind=real32)) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. 
- call allocate_array(buf_real32, e) - call netcdf_read_data(fileobj, variable_name, buf_real32, & + call allocate_array(buf_r4_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_r4_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -663,17 +663,17 @@ subroutine domain_read_4d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_real32, vdata, c, e) + call put_array_section(buf_r4_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_real32, size(buf_real32), fileobj%pelist(i)) + call mpp_send(buf_r4_kind, size(buf_r4_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_real32) - type is (real(kind=real64)) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_real64, e) - call netcdf_read_data(fileobj, variable_name, buf_real64, & + call allocate_array(buf_r8_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_r8_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -686,13 +686,13 @@ subroutine domain_read_4d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_real64, vdata, c, e) + call put_array_section(buf_r8_kind, vdata, c, e) else !Send data to non-root ranks. 
- call mpp_send(buf_real64, size(buf_real64), fileobj%pelist(i)) + call mpp_send(buf_r8_kind, size(buf_r8_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("domain_read_4d: Unsupported type for variable: "//variable_name//" in file: "//trim(fileobj%path)//"") end select @@ -709,26 +709,26 @@ subroutine domain_read_4d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = xc_size e(ydim_index) = yc_size select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call mpp_recv(buf_int32, size(buf_int32), fileobj%io_root, block=.true.) - call put_array_section(buf_int32, vdata, c, e) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call mpp_recv(buf_int64, size(buf_int64), fileobj%io_root, block=.true.) - call put_array_section(buf_int64, vdata, c, e) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) - call mpp_recv(buf_real32, size(buf_real32), fileobj%io_root, block=.true.) - call put_array_section(buf_real32, vdata, c, e) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call mpp_recv(buf_real64, size(buf_real64), fileobj%io_root, block=.true.) - call put_array_section(buf_real64, vdata, c, e) - deallocate(buf_real64) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call mpp_recv(buf_i4_kind, size(buf_i4_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_i4_kind, vdata, c, e) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call mpp_recv(buf_i8_kind, size(buf_i8_kind), fileobj%io_root, block=.true.) 
+ call put_array_section(buf_i8_kind, vdata, c, e) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call mpp_recv(buf_r4_kind, size(buf_r4_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_r4_kind, vdata, c, e) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call mpp_recv(buf_r8_kind, size(buf_r8_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_r8_kind, vdata, c, e) + deallocate(buf_r8_kind) class default call error("domain_read_4d: Unsupported type for variable: "//variable_name//" in file: "//trim(fileobj%path)//"") end select @@ -778,10 +778,10 @@ subroutine domain_read_5d(fileobj, variable_name, vdata, unlim_dim_level, & integer, dimension(:), allocatable :: pe_jcsize integer, dimension(5) :: c integer, dimension(5) :: e - integer(kind=int32), dimension(:,:,:,:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:,:,:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:,:,:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:,:,:,:), allocatable :: buf_real64 + integer(kind=i4_kind), dimension(:,:,:,:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:,:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:,:,:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:,:,:,:), allocatable :: buf_r8_kind logical :: buffer_includes_halos integer :: xgmin !< Starting x index of global io domain integer :: ygmin !< Starting y index of global io domain @@ -820,10 +820,10 @@ subroutine domain_read_5d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = pe_icsize(i) e(ydim_index) = pe_jcsize(i) select type(vdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. 
- call allocate_array(buf_int32, e) - call netcdf_read_data(fileobj, variable_name, buf_int32, & + call allocate_array(buf_i4_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_i4_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -837,17 +837,17 @@ subroutine domain_read_5d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_int32, vdata, c, e) + call put_array_section(buf_i4_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_int32, size(buf_int32), fileobj%pelist(i)) + call mpp_send(buf_i4_kind, size(buf_i4_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_int32) - type is (integer(kind=int64)) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_int64, e) - call netcdf_read_data(fileobj, variable_name, buf_int64, & + call allocate_array(buf_i8_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_i8_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -860,17 +860,17 @@ subroutine domain_read_5d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_int64, vdata, c, e) + call put_array_section(buf_i8_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_int64, size(buf_int64), fileobj%pelist(i)) + call mpp_send(buf_i8_kind, size(buf_i8_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_int64) - type is (real(kind=real32)) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. 
- call allocate_array(buf_real32, e) - call netcdf_read_data(fileobj, variable_name, buf_real32, & + call allocate_array(buf_r4_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_r4_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -883,17 +883,17 @@ subroutine domain_read_5d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_real32, vdata, c, e) + call put_array_section(buf_r4_kind, vdata, c, e) else !Send data to non-root ranks. - call mpp_send(buf_real32, size(buf_real32), fileobj%pelist(i)) + call mpp_send(buf_r4_kind, size(buf_r4_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_real32) - type is (real(kind=real64)) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) !Read in the data for fileobj%pelist(i)'s portion of the compute domain. - call allocate_array(buf_real64, e) - call netcdf_read_data(fileobj, variable_name, buf_real64, & + call allocate_array(buf_r8_kind, e) + call netcdf_read_data(fileobj, variable_name, buf_r8_kind, & unlim_dim_level=unlim_dim_level, & corner=c, edge_lengths=e, broadcast=.false.) if (i .eq. 1) then @@ -906,13 +906,13 @@ subroutine domain_read_5d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call put_array_section(buf_real64, vdata, c, e) + call put_array_section(buf_r8_kind, vdata, c, e) else !Send data to non-root ranks. 
- call mpp_send(buf_real64, size(buf_real64), fileobj%pelist(i)) + call mpp_send(buf_r8_kind, size(buf_r8_kind), fileobj%pelist(i)) call mpp_sync_self(check=EVENT_SEND) endif - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("domain_read_5d: Unsupported type for variable: "//variable_name//" in file: "//trim(fileobj%path)//"") end select @@ -929,26 +929,26 @@ subroutine domain_read_5d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = xc_size e(ydim_index) = yc_size select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call mpp_recv(buf_int32, size(buf_int32), fileobj%io_root, block=.true.) - call put_array_section(buf_int32, vdata, c, e) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call mpp_recv(buf_int64, size(buf_int64), fileobj%io_root, block=.true.) - call put_array_section(buf_int64, vdata, c, e) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) - call mpp_recv(buf_real32, size(buf_real32), fileobj%io_root, block=.true.) - call put_array_section(buf_real32, vdata, c, e) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call mpp_recv(buf_real64, size(buf_real64), fileobj%io_root, block=.true.) - call put_array_section(buf_real64, vdata, c, e) - deallocate(buf_real64) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call mpp_recv(buf_i4_kind, size(buf_i4_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_i4_kind, vdata, c, e) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call mpp_recv(buf_i8_kind, size(buf_i8_kind), fileobj%io_root, block=.true.) 
+ call put_array_section(buf_i8_kind, vdata, c, e) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call mpp_recv(buf_r4_kind, size(buf_r4_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_r4_kind, vdata, c, e) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call mpp_recv(buf_r8_kind, size(buf_r8_kind), fileobj%io_root, block=.true.) + call put_array_section(buf_r8_kind, vdata, c, e) + deallocate(buf_r8_kind) class default call error("domain_read_5d: Unsupported type for variable: "//variable_name//" in file: "//trim(fileobj%path)//"") end select diff --git a/fms2_io/include/domain_write.inc b/fms2_io/include/domain_write.inc index a8008e291c..f6d5e21344 100644 --- a/fms2_io/include/domain_write.inc +++ b/fms2_io/include/domain_write.inc @@ -102,17 +102,17 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & !! will be written !! in each dimension. 
- integer(kind=int32), dimension(:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:), allocatable :: buf_real64 + integer(kind=i4_kind), dimension(:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:), allocatable :: buf_r8_kind logical :: buffer_includes_halos integer, dimension(2) :: c integer, dimension(2) :: e - integer(kind=int32), dimension(:,:), allocatable :: global_buf_int32 - integer(kind=int64), dimension(:,:), allocatable :: global_buf_int64 - real(kind=real32), dimension(:,:), allocatable :: global_buf_real32 - real(kind=real64), dimension(:,:), allocatable :: global_buf_real64 + integer(kind=i4_kind), dimension(:,:), allocatable :: global_buf_i4_kind + integer(kind=i8_kind), dimension(:,:), allocatable :: global_buf_i8_kind + real(kind=r4_kind), dimension(:,:), allocatable :: global_buf_r4_kind + real(kind=r8_kind), dimension(:,:), allocatable :: global_buf_r8_kind integer :: i type(domain2d), pointer :: io_domain integer :: isc @@ -131,10 +131,10 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & integer :: ydim_index integer :: ypos integer :: yc_size - real(kind=int32) :: fill_int32 !< Fill value of a int32 variable - real(kind=int64) :: fill_int64 !< Fill value of a int64 variable - real(kind=real32) :: fill_real32 !< Fill value of a real32 variable - real(kind=real64) :: fill_real64 !< Fill value of a real64 variable + real(kind=i4_kind) :: fill_i4_kind !< Fill value of a i4_kind variable + real(kind=i8_kind) :: fill_i8_kind !< Fill value of a i8_kind variable + real(kind=r4_kind) :: fill_r4_kind !< Fill value of a r4_kind variable + real(kind=r8_kind) :: fill_r8_kind !< Fill value of a r8_kind variable integer :: xgmax !< Ending x 
index of the global io domain integer :: xgmin !< Starting x index of the global io domain integer :: ygmax !< Ending y index of the global io domain @@ -179,29 +179,29 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & !< Allocate a global buffer, get the fill value if it exists in the file, and initialize !! the buffer to the fill value select type(vdata) - type is (integer(kind=int32)) - call allocate_array(global_buf_int32, e) - global_buf_int32 = 0 - if (get_fill_value(fileobj, variable_name, fill_int32, broadcast=.false.)) then - global_buf_int32 = fill_int32 + type is (integer(kind=i4_kind)) + call allocate_array(global_buf_i4_kind, e) + global_buf_i4_kind = 0 + if (get_fill_value(fileobj, variable_name, fill_i4_kind, broadcast=.false.)) then + global_buf_i4_kind = fill_i4_kind endif - type is (integer(kind=int64)) - call allocate_array(global_buf_int64, e) - global_buf_int64 = 0 - if (get_fill_value(fileobj, variable_name, fill_int64, broadcast=.false.)) then - global_buf_int64 = fill_int64 + type is (integer(kind=i8_kind)) + call allocate_array(global_buf_i8_kind, e) + global_buf_i8_kind = 0 + if (get_fill_value(fileobj, variable_name, fill_i8_kind, broadcast=.false.)) then + global_buf_i8_kind = fill_i8_kind endif - type is (real(kind=real32)) - call allocate_array(global_buf_real32, e) - global_buf_real32 = 0. - if (get_fill_value(fileobj, variable_name, fill_real32, broadcast=.false.)) then - global_buf_real32 = fill_real32 + type is (real(kind=r4_kind)) + call allocate_array(global_buf_r4_kind, e) + global_buf_r4_kind = 0. + if (get_fill_value(fileobj, variable_name, fill_r4_kind, broadcast=.false.)) then + global_buf_r4_kind = fill_r4_kind endif - type is (real(kind=real64)) - call allocate_array(global_buf_real64, e) - global_buf_real64 = 0. 
- if (get_fill_value(fileobj, variable_name, fill_real64, broadcast=.false.)) then - global_buf_real64 = fill_real64 + type is (real(kind=r8_kind)) + call allocate_array(global_buf_r8_kind, e) + global_buf_r8_kind = 0. + if (get_fill_value(fileobj, variable_name, fill_r8_kind, broadcast=.false.)) then + global_buf_r8_kind = fill_r8_kind endif class default call error("unsupported type.") @@ -214,8 +214,8 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = pe_icsize(i) e(ydim_index) = pe_jcsize(i) select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -227,18 +227,18 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_int32, vdata, c, e) + call get_array_section(buf_i4_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_int32, size(buf_int32), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_i4_kind, size(buf_i4_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_int32, global_buf_int32, c, e) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) + call put_array_section(buf_i4_kind, global_buf_i4_kind, c, e) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. 
@@ -250,18 +250,18 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_int64, vdata, c, e) + call get_array_section(buf_i8_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_int64, size(buf_int64), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_i8_kind, size(buf_i8_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_int64, global_buf_int64, c, e) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) + call put_array_section(buf_i8_kind, global_buf_i8_kind, c, e) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -273,18 +273,18 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_real32, vdata, c, e) + call get_array_section(buf_r4_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_real32, size(buf_real32), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_r4_kind, size(buf_r4_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_real32, global_buf_real32, c, e) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) + call put_array_section(buf_r4_kind, global_buf_r4_kind, c, e) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. 
@@ -296,16 +296,16 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_real64, vdata, c, e) + call get_array_section(buf_r8_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_real64, size(buf_real64), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_r8_kind, size(buf_r8_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_real64, global_buf_real64, c, e) - deallocate(buf_real64) + call put_array_section(buf_r8_kind, global_buf_r8_kind, c, e) + deallocate(buf_r8_kind) end select enddo deallocate(pe_isc) @@ -317,22 +317,22 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & !Write the out the data. select type(vdata) - type is (integer(kind=int32)) - call netcdf_write_data(fileobj, variable_name, global_buf_int32, & + type is (integer(kind=i4_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_i4_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_int32) - type is (integer(kind=int64)) - call netcdf_write_data(fileobj, variable_name, global_buf_int64, & + deallocate(global_buf_i4_kind) + type is (integer(kind=i8_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_i8_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_int64) - type is (real(kind=real32)) - call netcdf_write_data(fileobj, variable_name, global_buf_real32, & + deallocate(global_buf_i8_kind) + type is (real(kind=r4_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_r4_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_real32) - type is (real(kind=real64)) - call netcdf_write_data(fileobj, variable_name, global_buf_real64, & + deallocate(global_buf_r4_kind) + type is (real(kind=r8_kind)) + call netcdf_write_data(fileobj, 
variable_name, global_buf_r8_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_real64) + deallocate(global_buf_r8_kind) end select else if (buffer_includes_halos) then @@ -342,30 +342,30 @@ subroutine domain_write_2d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = xc_size e(ydim_index) = yc_size select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call get_array_section(buf_int32, vdata, c, e) - call mpp_send(buf_int32, size(buf_int32), fileobj%io_root) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call get_array_section(buf_i4_kind, vdata, c, e) + call mpp_send(buf_i4_kind, size(buf_i4_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call get_array_section(buf_int64, vdata, c, e) - call mpp_send(buf_int64, size(buf_int64), fileobj%io_root) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call get_array_section(buf_i8_kind, vdata, c, e) + call mpp_send(buf_i8_kind, size(buf_i8_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) - call get_array_section(buf_real32, vdata, c, e) - call mpp_send(buf_real32, size(buf_real32), fileobj%io_root) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call get_array_section(buf_r4_kind, vdata, c, e) + call mpp_send(buf_r4_kind, size(buf_r4_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call get_array_section(buf_real64, vdata, c, e) - call mpp_send(buf_real64, size(buf_real64), fileobj%io_root) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call get_array_section(buf_r8_kind, 
vdata, c, e) + call mpp_send(buf_r8_kind, size(buf_r8_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("unsupported type.") end select @@ -396,17 +396,17 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & !! will be written !! in each dimension. - integer(kind=int32), dimension(:,:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:,:), allocatable :: buf_real64 + integer(kind=i4_kind), dimension(:,:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:,:), allocatable :: buf_r8_kind logical :: buffer_includes_halos integer, dimension(3) :: c integer, dimension(3) :: e - integer(kind=int32), dimension(:,:,:), allocatable :: global_buf_int32 - integer(kind=int64), dimension(:,:,:), allocatable :: global_buf_int64 - real(kind=real32), dimension(:,:,:), allocatable :: global_buf_real32 - real(kind=real64), dimension(:,:,:), allocatable :: global_buf_real64 + integer(kind=i4_kind), dimension(:,:,:), allocatable :: global_buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:), allocatable :: global_buf_i8_kind + real(kind=r4_kind), dimension(:,:,:), allocatable :: global_buf_r4_kind + real(kind=r8_kind), dimension(:,:,:), allocatable :: global_buf_r8_kind integer :: i type(domain2d), pointer :: io_domain integer :: isc @@ -425,10 +425,10 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & integer :: ydim_index integer :: ypos integer :: yc_size - real(kind=int32) :: fill_int32 !< Fill value of a int32 variable - real(kind=int64) :: fill_int64 !< Fill value of a int64 variable - real(kind=real32) :: fill_real32 !< Fill value of a real32 
variable - real(kind=real64) :: fill_real64 !< Fill value of a real64 variable + integer(kind=i4_kind) :: fill_i4_kind !< Fill value of a i4_kind variable + integer(kind=i8_kind) :: fill_i8_kind !< Fill value of a i8_kind variable + real(kind=r4_kind) :: fill_r4_kind !< Fill value of a r4_kind variable + real(kind=r8_kind) :: fill_r8_kind !< Fill value of a r8_kind variable integer :: xgmax !< Ending x index of the global io domain integer :: xgmin !< Starting x index of the global io domain integer :: ygmax !< Ending y index of the global io domain @@ -473,29 +473,29 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & !< Allocate a global buffer, get the fill value if it exists in the file, and initialize !! the buffer to the fill value select type(vdata) - type is (integer(kind=int32)) - call allocate_array(global_buf_int32, e) - global_buf_int32 = 0 - if (get_fill_value(fileobj, variable_name, fill_int32, broadcast=.false.)) then - global_buf_int32 = fill_int32 + type is (integer(kind=i4_kind)) + call allocate_array(global_buf_i4_kind, e) + global_buf_i4_kind = 0 + if (get_fill_value(fileobj, variable_name, fill_i4_kind, broadcast=.false.)) then + global_buf_i4_kind = fill_i4_kind endif - type is (integer(kind=int64)) - call allocate_array(global_buf_int64, e) - global_buf_int64 = 0 - if (get_fill_value(fileobj, variable_name, fill_int64, broadcast=.false.)) then - global_buf_int64 = fill_int64 + type is (integer(kind=i8_kind)) + call allocate_array(global_buf_i8_kind, e) + global_buf_i8_kind = 0 + if (get_fill_value(fileobj, variable_name, fill_i8_kind, broadcast=.false.)) then + global_buf_i8_kind = fill_i8_kind endif - type is (real(kind=real32)) - call allocate_array(global_buf_real32, e) - global_buf_real32 = 0. - if (get_fill_value(fileobj, variable_name, fill_real32, broadcast=.false.)) then - global_buf_real32 = fill_real32 + type is (real(kind=r4_kind)) + call allocate_array(global_buf_r4_kind, e) + global_buf_r4_kind = 0.
+ if (get_fill_value(fileobj, variable_name, fill_r4_kind, broadcast=.false.)) then + global_buf_r4_kind = fill_r4_kind endif - type is (real(kind=real64)) - call allocate_array(global_buf_real64, e) - global_buf_real64 = 0. - if (get_fill_value(fileobj, variable_name, fill_real64, broadcast=.false.)) then - global_buf_real64 = fill_real64 + type is (real(kind=r8_kind)) + call allocate_array(global_buf_r8_kind, e) + global_buf_r8_kind = 0. + if (get_fill_value(fileobj, variable_name, fill_r8_kind, broadcast=.false.)) then + global_buf_r8_kind = fill_r8_kind endif class default call error("unsupported type.") @@ -508,8 +508,8 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = pe_icsize(i) e(ydim_index) = pe_jcsize(i) select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -521,18 +521,18 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_int32, vdata, c, e) + call get_array_section(buf_i4_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_int32, size(buf_int32), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_i4_kind, size(buf_i4_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_int32, global_buf_int32, c, e) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) + call put_array_section(buf_i4_kind, global_buf_i4_kind, c, e) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. 
if (i .eq. 1) then !Root rank gets the data directly. @@ -544,18 +544,18 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_int64, vdata, c, e) + call get_array_section(buf_i8_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_int64, size(buf_int64), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_i8_kind, size(buf_i8_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_int64, global_buf_int64, c, e) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) + call put_array_section(buf_i8_kind, global_buf_i8_kind, c, e) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -567,18 +567,18 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_real32, vdata, c, e) + call get_array_section(buf_r4_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_real32, size(buf_real32), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_r4_kind, size(buf_r4_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_real32, global_buf_real32, c, e) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) + call put_array_section(buf_r4_kind, global_buf_r4_kind, c, e) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. 
if (i .eq. 1) then !Root rank gets the data directly. @@ -590,16 +590,16 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_real64, vdata, c, e) + call get_array_section(buf_r8_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_real64, size(buf_real64), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_r8_kind, size(buf_r8_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_real64, global_buf_real64, c, e) - deallocate(buf_real64) + call put_array_section(buf_r8_kind, global_buf_r8_kind, c, e) + deallocate(buf_r8_kind) end select enddo deallocate(pe_isc) @@ -611,22 +611,22 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & !Write the out the data. select type(vdata) - type is (integer(kind=int32)) - call netcdf_write_data(fileobj, variable_name, global_buf_int32, & + type is (integer(kind=i4_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_i4_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_int32) - type is (integer(kind=int64)) - call netcdf_write_data(fileobj, variable_name, global_buf_int64, & + deallocate(global_buf_i4_kind) + type is (integer(kind=i8_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_i8_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_int64) - type is (real(kind=real32)) - call netcdf_write_data(fileobj, variable_name, global_buf_real32, & + deallocate(global_buf_i8_kind) + type is (real(kind=r4_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_r4_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_real32) - type is (real(kind=real64)) - call netcdf_write_data(fileobj, variable_name, global_buf_real64, & + deallocate(global_buf_r4_kind) + type is 
(real(kind=r8_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_r8_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_real64) + deallocate(global_buf_r8_kind) end select else if (buffer_includes_halos) then @@ -636,30 +636,30 @@ subroutine domain_write_3d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = xc_size e(ydim_index) = yc_size select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call get_array_section(buf_int32, vdata, c, e) - call mpp_send(buf_int32, size(buf_int32), fileobj%io_root) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call get_array_section(buf_i4_kind, vdata, c, e) + call mpp_send(buf_i4_kind, size(buf_i4_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call get_array_section(buf_int64, vdata, c, e) - call mpp_send(buf_int64, size(buf_int64), fileobj%io_root) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call get_array_section(buf_i8_kind, vdata, c, e) + call mpp_send(buf_i8_kind, size(buf_i8_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) - call get_array_section(buf_real32, vdata, c, e) - call mpp_send(buf_real32, size(buf_real32), fileobj%io_root) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call get_array_section(buf_r4_kind, vdata, c, e) + call mpp_send(buf_r4_kind, size(buf_r4_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call get_array_section(buf_real64, vdata, c, e) - call mpp_send(buf_real64, size(buf_real64), fileobj%io_root) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call 
allocate_array(buf_r8_kind, e) + call get_array_section(buf_r8_kind, vdata, c, e) + call mpp_send(buf_r8_kind, size(buf_r8_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("unsupported type.") end select @@ -690,17 +690,17 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & !! will be written !! in each dimension. - integer(kind=int32), dimension(:,:,:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:,:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:,:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:,:,:), allocatable :: buf_real64 + integer(kind=i4_kind), dimension(:,:,:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:,:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:,:,:), allocatable :: buf_r8_kind logical :: buffer_includes_halos integer, dimension(4) :: c integer, dimension(4) :: e - integer(kind=int32), dimension(:,:,:,:), allocatable :: global_buf_int32 - integer(kind=int64), dimension(:,:,:,:), allocatable :: global_buf_int64 - real(kind=real32), dimension(:,:,:,:), allocatable :: global_buf_real32 - real(kind=real64), dimension(:,:,:,:), allocatable :: global_buf_real64 + integer(kind=i4_kind), dimension(:,:,:,:), allocatable :: global_buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:,:), allocatable :: global_buf_i8_kind + real(kind=r4_kind), dimension(:,:,:,:), allocatable :: global_buf_r4_kind + real(kind=r8_kind), dimension(:,:,:,:), allocatable :: global_buf_r8_kind integer :: i type(domain2d), pointer :: io_domain integer :: isc @@ -719,10 +719,10 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & integer :: ydim_index integer :: ypos integer :: yc_size - real(kind=int32) :: fill_int32 !< Fill value of a int32 variable - real(kind=int64) :: fill_int64 
!< Fill value of a int64 variable - real(kind=real32) :: fill_real32 !< Fill value of a real32 variable - real(kind=real64) :: fill_real64 !< Fill value of a real64 variable + integer(kind=i4_kind) :: fill_i4_kind !< Fill value of a i4_kind variable + integer(kind=i8_kind) :: fill_i8_kind !< Fill value of a i8_kind variable + real(kind=r4_kind) :: fill_r4_kind !< Fill value of a r4_kind variable + real(kind=r8_kind) :: fill_r8_kind !< Fill value of a r8_kind variable integer :: xgmax !< Ending x index of the global io domain integer :: xgmin !< Starting x index of the global io domain integer :: ygmax !< Ending y index of the global io domain @@ -767,29 +767,29 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & !< Allocate a global buffer, get the fill value if it exists in the file, and initialize !! the buffer to the fill value select type(vdata) - type is (integer(kind=int32)) - call allocate_array(global_buf_int32, e) - global_buf_int32 = 0 - if (get_fill_value(fileobj, variable_name, fill_int32, broadcast=.false.)) then - global_buf_int32 = fill_int32 + type is (integer(kind=i4_kind)) + call allocate_array(global_buf_i4_kind, e) + global_buf_i4_kind = 0 + if (get_fill_value(fileobj, variable_name, fill_i4_kind, broadcast=.false.)) then + global_buf_i4_kind = fill_i4_kind endif - type is (integer(kind=int64)) - call allocate_array(global_buf_int64, e) - global_buf_int64 = 0 - if (get_fill_value(fileobj, variable_name, fill_int64, broadcast=.false.)) then - global_buf_int64 = fill_int64 + type is (integer(kind=i8_kind)) + call allocate_array(global_buf_i8_kind, e) + global_buf_i8_kind = 0 + if (get_fill_value(fileobj, variable_name, fill_i8_kind, broadcast=.false.)) then + global_buf_i8_kind = fill_i8_kind endif - type is (real(kind=real32)) - call allocate_array(global_buf_real32, e) - global_buf_real32 = 0.
- if (get_fill_value(fileobj, variable_name, fill_real32, broadcast=.false.)) then - global_buf_real32 = fill_real32 + type is (real(kind=r4_kind)) + call allocate_array(global_buf_r4_kind, e) + global_buf_r4_kind = 0. + if (get_fill_value(fileobj, variable_name, fill_r4_kind, broadcast=.false.)) then + global_buf_r4_kind = fill_r4_kind endif - type is (real(kind=real64)) - call allocate_array(global_buf_real64, e) - global_buf_real64 = 0. - if (get_fill_value(fileobj, variable_name, fill_real64, broadcast=.false.)) then - global_buf_real64 = fill_real64 + type is (real(kind=r8_kind)) + call allocate_array(global_buf_r8_kind, e) + global_buf_r8_kind = 0. + if (get_fill_value(fileobj, variable_name, fill_r8_kind, broadcast=.false.)) then + global_buf_r8_kind = fill_r8_kind endif class default call error("unsupported type.") @@ -802,8 +802,8 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = pe_icsize(i) e(ydim_index) = pe_jcsize(i) select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -815,18 +815,18 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_int32, vdata, c, e) + call get_array_section(buf_i4_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_int32, size(buf_int32), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_i4_kind, size(buf_i4_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. 
- call put_array_section(buf_int32, global_buf_int32, c, e) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) + call put_array_section(buf_i4_kind, global_buf_i4_kind, c, e) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -838,18 +838,18 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_int64, vdata, c, e) + call get_array_section(buf_i8_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_int64, size(buf_int64), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_i8_kind, size(buf_i8_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_int64, global_buf_int64, c, e) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) + call put_array_section(buf_i8_kind, global_buf_i8_kind, c, e) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -861,18 +861,18 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_real32, vdata, c, e) + call get_array_section(buf_r4_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_real32, size(buf_real32), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_r4_kind, size(buf_r4_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. 
- call put_array_section(buf_real32, global_buf_real32, c, e) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) + call put_array_section(buf_r4_kind, global_buf_r4_kind, c, e) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -884,16 +884,16 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_real64, vdata, c, e) + call get_array_section(buf_r8_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_real64, size(buf_real64), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_r8_kind, size(buf_r8_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_real64, global_buf_real64, c, e) - deallocate(buf_real64) + call put_array_section(buf_r8_kind, global_buf_r8_kind, c, e) + deallocate(buf_r8_kind) end select enddo deallocate(pe_isc) @@ -905,22 +905,22 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & !Write the out the data. 
select type(vdata) - type is (integer(kind=int32)) - call netcdf_write_data(fileobj, variable_name, global_buf_int32, & + type is (integer(kind=i4_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_i4_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_int32) - type is (integer(kind=int64)) - call netcdf_write_data(fileobj, variable_name, global_buf_int64, & + deallocate(global_buf_i4_kind) + type is (integer(kind=i8_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_i8_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_int64) - type is (real(kind=real32)) - call netcdf_write_data(fileobj, variable_name, global_buf_real32, & + deallocate(global_buf_i8_kind) + type is (real(kind=r4_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_r4_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_real32) - type is (real(kind=real64)) - call netcdf_write_data(fileobj, variable_name, global_buf_real64, & + deallocate(global_buf_r4_kind) + type is (real(kind=r8_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_r8_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_real64) + deallocate(global_buf_r8_kind) end select else if (buffer_includes_halos) then @@ -930,30 +930,30 @@ subroutine domain_write_4d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = xc_size e(ydim_index) = yc_size select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call get_array_section(buf_int32, vdata, c, e) - call mpp_send(buf_int32, size(buf_int32), fileobj%io_root) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call get_array_section(buf_i4_kind, vdata, c, e) + call mpp_send(buf_i4_kind, size(buf_i4_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call get_array_section(buf_int64, vdata, c, e) - call 
mpp_send(buf_int64, size(buf_int64), fileobj%io_root) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call get_array_section(buf_i8_kind, vdata, c, e) + call mpp_send(buf_i8_kind, size(buf_i8_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) - call get_array_section(buf_real32, vdata, c, e) - call mpp_send(buf_real32, size(buf_real32), fileobj%io_root) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call get_array_section(buf_r4_kind, vdata, c, e) + call mpp_send(buf_r4_kind, size(buf_r4_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call get_array_section(buf_real64, vdata, c, e) - call mpp_send(buf_real64, size(buf_real64), fileobj%io_root) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call get_array_section(buf_r8_kind, vdata, c, e) + call mpp_send(buf_r8_kind, size(buf_r8_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("unsupported type.") end select @@ -984,17 +984,17 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & !! will be written !! in each dimension. 
- integer(kind=int32), dimension(:,:,:,:,:), allocatable :: buf_int32 - integer(kind=int64), dimension(:,:,:,:,:), allocatable :: buf_int64 - real(kind=real32), dimension(:,:,:,:,:), allocatable :: buf_real32 - real(kind=real64), dimension(:,:,:,:,:), allocatable :: buf_real64 + integer(kind=i4_kind), dimension(:,:,:,:,:), allocatable :: buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:,:,:), allocatable :: buf_i8_kind + real(kind=r4_kind), dimension(:,:,:,:,:), allocatable :: buf_r4_kind + real(kind=r8_kind), dimension(:,:,:,:,:), allocatable :: buf_r8_kind logical :: buffer_includes_halos integer, dimension(5) :: c integer, dimension(5) :: e - integer(kind=int32), dimension(:,:,:,:,:), allocatable :: global_buf_int32 - integer(kind=int64), dimension(:,:,:,:,:), allocatable :: global_buf_int64 - real(kind=real32), dimension(:,:,:,:,:), allocatable :: global_buf_real32 - real(kind=real64), dimension(:,:,:,:,:), allocatable :: global_buf_real64 + integer(kind=i4_kind), dimension(:,:,:,:,:), allocatable :: global_buf_i4_kind + integer(kind=i8_kind), dimension(:,:,:,:,:), allocatable :: global_buf_i8_kind + real(kind=r4_kind), dimension(:,:,:,:,:), allocatable :: global_buf_r4_kind + real(kind=r8_kind), dimension(:,:,:,:,:), allocatable :: global_buf_r8_kind integer :: i type(domain2d), pointer :: io_domain integer :: isc @@ -1013,10 +1013,10 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & integer :: ydim_index integer :: ypos integer :: yc_size - real(kind=int32) :: fill_int32 !< Fill value of a int32 variable - real(kind=int64) :: fill_int64 !< Fill value of a int64 variable - real(kind=real32) :: fill_real32 !< Fill value of a real32 variable - real(kind=real64) :: fill_real64 !< Fill value of a real64 variable + real(kind=i4_kind) :: fill_i4_kind !< Fill value of a i4_kind variable + real(kind=i8_kind) :: fill_i8_kind !< Fill value of a i8_kind variable + real(kind=r4_kind) :: fill_r4_kind !< Fill value of a r4_kind variable + 
real(kind=r8_kind) :: fill_r8_kind !< Fill value of a r8_kind variable integer :: xgmax !< Ending x index of the global io domain integer :: xgmin !< Starting x index of the global io domain integer :: ygmax !< Ending y index of the global io domain @@ -1061,29 +1061,29 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & !< Allocate a global buffer, get the fill value if it exists in the file, and initialize !! the buffer to the fill value select type(vdata) - type is (integer(kind=int32)) - call allocate_array(global_buf_int32, e) - global_buf_int32 = 0 - if (get_fill_value(fileobj, variable_name, fill_int32, broadcast=.false.)) then - global_buf_int32 = fill_int32 + type is (integer(kind=i4_kind)) + call allocate_array(global_buf_i4_kind, e) + global_buf_i4_kind = 0 + if (get_fill_value(fileobj, variable_name, fill_i4_kind, broadcast=.false.)) then + global_buf_i4_kind = fill_i4_kind endif - type is (integer(kind=int64)) - call allocate_array(global_buf_int64, e) - global_buf_int64 = 0 - if (get_fill_value(fileobj, variable_name, fill_int64, broadcast=.false.)) then - global_buf_int64 = fill_int64 + type is (integer(kind=i8_kind)) + call allocate_array(global_buf_i8_kind, e) + global_buf_i8_kind = 0 + if (get_fill_value(fileobj, variable_name, fill_i8_kind, broadcast=.false.)) then + global_buf_i8_kind = fill_i8_kind endif - type is (real(kind=real32)) - call allocate_array(global_buf_real32, e) - global_buf_real32 = 0. - if (get_fill_value(fileobj, variable_name, fill_real32, broadcast=.false.)) then - global_buf_real32 = fill_real32 + type is (real(kind=r4_kind)) + call allocate_array(global_buf_r4_kind, e) + global_buf_r4_kind = 0. + if (get_fill_value(fileobj, variable_name, fill_r4_kind, broadcast=.false.)) then + global_buf_r4_kind = fill_r4_kind endif - type is (real(kind=real64)) - call allocate_array(global_buf_real64, e) - global_buf_real64 = 0. 
- if (get_fill_value(fileobj, variable_name, fill_real64, broadcast=.false.)) then - global_buf_real64 = fill_real64 + type is (real(kind=r8_kind)) + call allocate_array(global_buf_r8_kind, e) + global_buf_r8_kind = 0. + if (get_fill_value(fileobj, variable_name, fill_r8_kind, broadcast=.false.)) then + global_buf_r8_kind = fill_r8_kind endif class default call error("unsupported type.") @@ -1096,8 +1096,8 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = pe_icsize(i) e(ydim_index) = pe_jcsize(i) select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -1109,18 +1109,18 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_int32, vdata, c, e) + call get_array_section(buf_i4_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_int32, size(buf_int32), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_i4_kind, size(buf_i4_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_int32, global_buf_int32, c, e) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) + call put_array_section(buf_i4_kind, global_buf_i4_kind, c, e) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. 
@@ -1132,18 +1132,18 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_int64, vdata, c, e) + call get_array_section(buf_i8_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_int64, size(buf_int64), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_i8_kind, size(buf_i8_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_int64, global_buf_int64, c, e) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) + call put_array_section(buf_i8_kind, global_buf_i8_kind, c, e) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. @@ -1155,18 +1155,18 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_real32, vdata, c, e) + call get_array_section(buf_r4_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_real32, size(buf_real32), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_r4_kind, size(buf_r4_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_real32, global_buf_real32, c, e) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) + call put_array_section(buf_r4_kind, global_buf_r4_kind, c, e) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) !Get the data for fileobj%pelist(i)'s portion of the compute domain. if (i .eq. 1) then !Root rank gets the data directly. 
@@ -1178,16 +1178,16 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & c(xdim_index) = 1 c(ydim_index) = 1 endif - call get_array_section(buf_real64, vdata, c, e) + call get_array_section(buf_r8_kind, vdata, c, e) c(xdim_index) = pe_isc(i) - xgmin + 1 c(ydim_index) = pe_jsc(i) - ygmin + 1 else !Receive data from non-root ranks. - call mpp_recv(buf_real64, size(buf_real64), fileobj%pelist(i), block=.true.) + call mpp_recv(buf_r8_kind, size(buf_r8_kind), fileobj%pelist(i), block=.true.) endif !Put local data into the global buffer. - call put_array_section(buf_real64, global_buf_real64, c, e) - deallocate(buf_real64) + call put_array_section(buf_r8_kind, global_buf_r8_kind, c, e) + deallocate(buf_r8_kind) end select enddo deallocate(pe_isc) @@ -1199,22 +1199,22 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & !Write the out the data. select type(vdata) - type is (integer(kind=int32)) - call netcdf_write_data(fileobj, variable_name, global_buf_int32, & + type is (integer(kind=i4_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_i4_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_int32) - type is (integer(kind=int64)) - call netcdf_write_data(fileobj, variable_name, global_buf_int64, & + deallocate(global_buf_i4_kind) + type is (integer(kind=i8_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_i8_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_int64) - type is (real(kind=real32)) - call netcdf_write_data(fileobj, variable_name, global_buf_real32, & + deallocate(global_buf_i8_kind) + type is (real(kind=r4_kind)) + call netcdf_write_data(fileobj, variable_name, global_buf_r4_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_real32) - type is (real(kind=real64)) - call netcdf_write_data(fileobj, variable_name, global_buf_real64, & + deallocate(global_buf_r4_kind) + type is (real(kind=r8_kind)) + call netcdf_write_data(fileobj, 
variable_name, global_buf_r8_kind, & unlim_dim_level=unlim_dim_level) - deallocate(global_buf_real64) + deallocate(global_buf_r8_kind) end select else if (buffer_includes_halos) then @@ -1224,30 +1224,30 @@ subroutine domain_write_5d(fileobj, variable_name, vdata, unlim_dim_level, & e(xdim_index) = xc_size e(ydim_index) = yc_size select type(vdata) - type is (integer(kind=int32)) - call allocate_array(buf_int32, e) - call get_array_section(buf_int32, vdata, c, e) - call mpp_send(buf_int32, size(buf_int32), fileobj%io_root) + type is (integer(kind=i4_kind)) + call allocate_array(buf_i4_kind, e) + call get_array_section(buf_i4_kind, vdata, c, e) + call mpp_send(buf_i4_kind, size(buf_i4_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_int32) - type is (integer(kind=int64)) - call allocate_array(buf_int64, e) - call get_array_section(buf_int64, vdata, c, e) - call mpp_send(buf_int64, size(buf_int64), fileobj%io_root) + deallocate(buf_i4_kind) + type is (integer(kind=i8_kind)) + call allocate_array(buf_i8_kind, e) + call get_array_section(buf_i8_kind, vdata, c, e) + call mpp_send(buf_i8_kind, size(buf_i8_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_int64) - type is (real(kind=real32)) - call allocate_array(buf_real32, e) - call get_array_section(buf_real32, vdata, c, e) - call mpp_send(buf_real32, size(buf_real32), fileobj%io_root) + deallocate(buf_i8_kind) + type is (real(kind=r4_kind)) + call allocate_array(buf_r4_kind, e) + call get_array_section(buf_r4_kind, vdata, c, e) + call mpp_send(buf_r4_kind, size(buf_r4_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_real32) - type is (real(kind=real64)) - call allocate_array(buf_real64, e) - call get_array_section(buf_real64, vdata, c, e) - call mpp_send(buf_real64, size(buf_real64), fileobj%io_root) + deallocate(buf_r4_kind) + type is (real(kind=r8_kind)) + call allocate_array(buf_r8_kind, e) + call get_array_section(buf_r8_kind, 
vdata, c, e) + call mpp_send(buf_r8_kind, size(buf_r8_kind), fileobj%io_root) call mpp_sync_self(check=event_send) - deallocate(buf_real64) + deallocate(buf_r8_kind) class default call error("unsupported type.") end select diff --git a/fms2_io/include/get_checksum.inc b/fms2_io/include/get_checksum.inc index e000cdf2e5..8143859e96 100644 --- a/fms2_io/include/get_checksum.inc +++ b/fms2_io/include/get_checksum.inc @@ -29,13 +29,13 @@ function get_checksum_0d(data) result(chksum) myrank(1) = mpp_pe() chksum = "" select type(data) - type is (integer(int32)) + type is (integer(i4_kind)) ! write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (integer(int64)) + type is (integer(i8_kind)) ! write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real32)) + type is (real(r4_kind)) ! write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real64)) + type is (real(r8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) end select @@ -52,13 +52,13 @@ function get_checksum_1d(data) result(chksum) myrank(1) = mpp_pe() chksum = "" select type(data) - type is (integer(int32)) + type is (integer(i4_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (integer(int64)) + type is (integer(i8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real32)) + type is (real(r4_kind)) ! write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real64)) + type is (real(r8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) end select @@ -75,13 +75,13 @@ function get_checksum_2d(data) result(chksum) myrank(1) = mpp_pe() chksum = "" select type(data) - type is (integer(int32)) + type is (integer(i4_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (integer(int64)) + type is (integer(i8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real32)) + type is (real(r4_kind)) ! 
write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real64)) + type is (real(r8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) end select @@ -98,13 +98,13 @@ function get_checksum_3d(data) result(chksum) myrank(1) = mpp_pe() chksum = "" select type(data) - type is (integer(int32)) + type is (integer(i4_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (integer(int64)) + type is (integer(i8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real32)) + type is (real(r4_kind)) ! write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real64)) + type is (real(r8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) end select @@ -121,13 +121,13 @@ function get_checksum_4d(data) result(chksum) myrank(1) = mpp_pe() chksum = "" select type(data) - type is (integer(int32)) + type is (integer(i4_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (integer(int64)) + type is (integer(i8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real32)) + type is (real(r4_kind)) ! write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real64)) + type is (real(r8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) end select @@ -144,13 +144,13 @@ function get_checksum_5d(data) result(chksum) myrank(1) = mpp_pe() chksum = "" select type(data) - type is (integer(int32)) + type is (integer(i4_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (integer(int64)) + type is (integer(i8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real32)) + type is (real(r4_kind)) ! 
write(chksum, "(Z16)") mpp_chksum(data, myrank) - type is (real(real64)) + type is (real(r8_kind)) write(chksum, "(Z16)") mpp_chksum(data, myrank) end select diff --git a/fms2_io/include/get_data_type_string.inc b/fms2_io/include/get_data_type_string.inc index 1948f88de2..dbc6f5e2a5 100644 --- a/fms2_io/include/get_data_type_string.inc +++ b/fms2_io/include/get_data_type_string.inc @@ -27,13 +27,13 @@ subroutine get_data_type_string_0d(sdata, type_string) character(len=*), intent(inout) :: type_string !> Data type. select type(sdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call string_copy(type_string, "int") - type is (integer(kind=int64)) - call string_copy(type_string, "int64") - type is (real(kind=real32)) + type is (integer(kind=i8_kind)) + call string_copy(type_string, "i8_kind") + type is (real(kind=r4_kind)) call string_copy(type_string, "float") - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call string_copy(type_string, "double") type is (character(len=*)) call string_copy(type_string, "char") @@ -51,13 +51,13 @@ subroutine get_data_type_string_1d(sdata, type_string) character(len=*), intent(inout) :: type_string !> Data type. select type(sdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call string_copy(type_string, "int") - type is (integer(kind=int64)) - call string_copy(type_string, "int64") - type is (real(kind=real32)) + type is (integer(kind=i8_kind)) + call string_copy(type_string, "i8_kind") + type is (real(kind=r4_kind)) call string_copy(type_string, "float") - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call string_copy(type_string, "double") type is (character(len=*)) call string_copy(type_string, "char") @@ -75,13 +75,13 @@ subroutine get_data_type_string_2d(sdata, type_string) character(len=*), intent(inout) :: type_string !> Data type. 
select type(sdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call string_copy(type_string, "int") - type is (integer(kind=int64)) - call string_copy(type_string, "int64") - type is (real(kind=real32)) + type is (integer(kind=i8_kind)) + call string_copy(type_string, "i8_kind") + type is (real(kind=r4_kind)) call string_copy(type_string, "float") - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call string_copy(type_string, "double") type is (character(len=*)) call string_copy(type_string, "char") @@ -99,13 +99,13 @@ subroutine get_data_type_string_3d(sdata, type_string) character(len=*), intent(inout) :: type_string !> Data type. select type(sdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call string_copy(type_string, "int") - type is (integer(kind=int64)) - call string_copy(type_string, "int64") - type is (real(kind=real32)) + type is (integer(kind=i8_kind)) + call string_copy(type_string, "i8_kind") + type is (real(kind=r4_kind)) call string_copy(type_string, "float") - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call string_copy(type_string, "double") type is (character(len=*)) call string_copy(type_string, "char") @@ -123,13 +123,13 @@ subroutine get_data_type_string_4d(sdata, type_string) character(len=*), intent(inout) :: type_string !> Data type. 
select type(sdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call string_copy(type_string, "int") - type is (integer(kind=int64)) - call string_copy(type_string, "int64") - type is (real(kind=real32)) + type is (integer(kind=i8_kind)) + call string_copy(type_string, "i8_kind") + type is (real(kind=r4_kind)) call string_copy(type_string, "float") - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call string_copy(type_string, "double") type is (character(len=*)) call string_copy(type_string, "char") @@ -147,13 +147,13 @@ subroutine get_data_type_string_5d(sdata, type_string) character(len=*), intent(inout) :: type_string !> Data type. select type(sdata) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call string_copy(type_string, "int") - type is (integer(kind=int64)) - call string_copy(type_string, "int64") - type is (real(kind=real32)) + type is (integer(kind=i8_kind)) + call string_copy(type_string, "i8_kind") + type is (real(kind=r4_kind)) call string_copy(type_string, "float") - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call string_copy(type_string, "double") type is (character(len=*)) call string_copy(type_string, "char") diff --git a/fms2_io/include/get_variable_attribute.inc b/fms2_io/include/get_variable_attribute.inc index 00316b77e7..e4edd22ba4 100644 --- a/fms2_io/include/get_variable_attribute.inc +++ b/fms2_io/include/get_variable_attribute.inc @@ -56,13 +56,13 @@ subroutine get_variable_attribute_0d(fileobj, variable_name, attribute_name, & else call string_copy(attribute_value, charbuf(1), check_for_null=.true.) 
endif - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_get_att(fileobj%ncid, varid, trim(attribute_name), attribute_value) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_get_att(fileobj%ncid, varid, trim(attribute_name), attribute_value) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_get_att(fileobj%ncid, varid, trim(attribute_name), attribute_value) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_get_att(fileobj%ncid, varid, trim(attribute_name), attribute_value) class default call error("unsupported type.") @@ -82,13 +82,13 @@ subroutine get_variable_attribute_0d(fileobj, variable_name, attribute_name, & else call string_copy(attribute_value, charbuf(1), check_for_null=.true.) endif - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call mpp_broadcast(attribute_value, fileobj%io_root, pelist=fileobj%pelist) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) call mpp_broadcast(attribute_value, fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) call mpp_broadcast(attribute_value, fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call mpp_broadcast(attribute_value, fileobj%io_root, pelist=fileobj%pelist) class default call error("unsupported type.") @@ -117,13 +117,13 @@ subroutine get_variable_attribute_1d(fileobj, variable_name, attribute_name, & if (fileobj%is_root) then varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(attribute_value) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_get_att(fileobj%ncid, varid, trim(attribute_name), attribute_value) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_get_att(fileobj%ncid, varid, trim(attribute_name), attribute_value) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = 
nf90_get_att(fileobj%ncid, varid, trim(attribute_name), attribute_value) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_get_att(fileobj%ncid, varid, trim(attribute_name), attribute_value) class default call error("unsupported type.") @@ -136,16 +136,16 @@ subroutine get_variable_attribute_1d(fileobj, variable_name, attribute_name, & endif endif select type(attribute_value) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call mpp_broadcast(attribute_value, size(attribute_value), fileobj%io_root, & pelist=fileobj%pelist) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) call mpp_broadcast(attribute_value, size(attribute_value), fileobj%io_root, & pelist=fileobj%pelist) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) call mpp_broadcast(attribute_value, size(attribute_value), fileobj%io_root, & pelist=fileobj%pelist) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call mpp_broadcast(attribute_value, size(attribute_value), fileobj%io_root, & pelist=fileobj%pelist) class default diff --git a/fms2_io/include/netcdf_read_data.inc b/fms2_io/include/netcdf_read_data.inc index 3f65c55435..4362d2a8d9 100644 --- a/fms2_io/include/netcdf_read_data.inc +++ b/fms2_io/include/netcdf_read_data.inc @@ -63,13 +63,13 @@ subroutine netcdf_read_data_0d(fileobj, variable_name, buf, unlim_dim_level, & if (fileobj%is_root) then varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c) type is (character(len=*)) start(:) = 1 @@ 
-106,13 +106,13 @@ subroutine netcdf_read_data_0d(fileobj, variable_name, buf, unlim_dim_level, & endif if (bcast) then select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call mpp_broadcast(buf, fileobj%io_root, pelist=fileobj%pelist) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) call mpp_broadcast(buf, fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) call mpp_broadcast(buf, fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call mpp_broadcast(buf, fileobj%io_root, pelist=fileobj%pelist) type is (character(len=*)) call string_copy(buf1d(1), buf) @@ -190,13 +190,13 @@ subroutine netcdf_read_data_1d(fileobj, variable_name, buf, unlim_dim_level, & if (fileobj%is_root) then varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) type is (character(len=*)) ndims = get_variable_num_dimensions(fileobj, variable_name, broadcast=.false.) 
@@ -234,13 +234,13 @@ subroutine netcdf_read_data_1d(fileobj, variable_name, buf, unlim_dim_level, & endif if (bcast) then select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) type is (character(len=*)) call mpp_broadcast(buf, len(buf(1)), fileobj%io_root, pelist=fileobj%pelist) @@ -309,13 +309,13 @@ subroutine netcdf_read_data_2d(fileobj, variable_name, buf, unlim_dim_level, & if (fileobj%is_root) then varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) class default call error("unsupported type.") @@ -324,13 +324,13 @@ subroutine netcdf_read_data_2d(fileobj, variable_name, buf, unlim_dim_level, & endif if (bcast) then select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is 
(real(kind=real32)) + type is (real(kind=r4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) class default call error("unsupported type.") @@ -397,13 +397,13 @@ subroutine netcdf_read_data_3d(fileobj, variable_name, buf, unlim_dim_level, & if (fileobj%is_root) then varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) class default call error("unsupported type.") @@ -412,13 +412,13 @@ subroutine netcdf_read_data_3d(fileobj, variable_name, buf, unlim_dim_level, & endif if (bcast) then select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) class default call error("unsupported type.") @@ -485,13 +485,13 @@ subroutine netcdf_read_data_4d(fileobj, variable_name, buf, unlim_dim_level, & if (fileobj%is_root) then varid = get_variable_id(fileobj%ncid, trim(variable_name)) select 
type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) class default call error("unsupported type.") @@ -500,13 +500,13 @@ subroutine netcdf_read_data_4d(fileobj, variable_name, buf, unlim_dim_level, & endif if (bcast) then select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) class default call error("unsupported type.") @@ -573,13 +573,13 @@ subroutine netcdf_read_data_5d(fileobj, variable_name, buf, unlim_dim_level, & if (fileobj%is_root) then varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_get_var(fileobj%ncid, varid, buf, 
start=c, count=e) class default call error("unsupported type.") @@ -588,13 +588,13 @@ subroutine netcdf_read_data_5d(fileobj, variable_name, buf, unlim_dim_level, & endif if (bcast) then select type(buf) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) call mpp_broadcast(buf, size(buf), fileobj%io_root, pelist=fileobj%pelist) class default call error("unsupported type.") diff --git a/fms2_io/include/netcdf_write_data.inc b/fms2_io/include/netcdf_write_data.inc index b44b080242..8634fedec5 100644 --- a/fms2_io/include/netcdf_write_data.inc +++ b/fms2_io/include/netcdf_write_data.inc @@ -57,13 +57,13 @@ subroutine netcdf_write_data_0d(fileobj, variable_name, variable_data, unlim_dim call set_netcdf_mode(fileobj%ncid, data_mode) varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(variable_data) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c) type is (character(len=*)) ndims = get_variable_num_dimensions(fileobj, variable_name, broadcast=.false.) 
@@ -148,13 +148,13 @@ subroutine netcdf_write_data_1d(fileobj, variable_name, variable_data, unlim_dim call set_netcdf_mode(fileobj%ncid, data_mode) varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(variable_data) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) type is (character(len=*)) ndims = get_variable_num_dimensions(fileobj, variable_name, broadcast=.false.) @@ -237,13 +237,13 @@ subroutine netcdf_write_data_2d(fileobj, variable_name, variable_data, unlim_dim call set_netcdf_mode(fileobj%ncid, data_mode) varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(variable_data) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c ,count=e) class default call error("unsupported type.") @@ -299,13 +299,13 @@ subroutine netcdf_write_data_3d(fileobj, variable_name, variable_data, unlim_dim call set_netcdf_mode(fileobj%ncid, data_mode) varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(variable_data) - type is (integer(kind=int32)) + type is 
(integer(kind=i4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) class default call error("unsupported type.") @@ -359,13 +359,13 @@ subroutine netcdf_write_data_4d(fileobj, variable_name, variable_data, unlim_dim call set_netcdf_mode(fileobj%ncid, data_mode) varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(variable_data) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) class default call error("unsupported type.") @@ -422,13 +422,13 @@ subroutine netcdf_write_data_5d(fileobj, variable_name, variable_data, unlim_dim call set_netcdf_mode(fileobj%ncid, data_mode) varid = get_variable_id(fileobj%ncid, trim(variable_name)) select type(variable_data) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_put_var(fileobj%ncid, varid, 
variable_data, start=c, count=e) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_put_var(fileobj%ncid, varid, variable_data, start=c, count=e) class default call error("unsupported type.") diff --git a/fms2_io/include/register_global_attribute.inc b/fms2_io/include/register_global_attribute.inc index 3432c5d1e6..17932643db 100644 --- a/fms2_io/include/register_global_attribute.inc +++ b/fms2_io/include/register_global_attribute.inc @@ -36,22 +36,22 @@ subroutine register_global_attribute_0d(fileobj, & nf90_global, & trim(attribute_name), & trim(attribute_value(1:str_len))) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_put_att(fileobj%ncid, & nf90_global, & trim(attribute_name), & attribute_value) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_put_att(fileobj%ncid, & nf90_global, & trim(attribute_name), & attribute_value) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_put_att(fileobj%ncid, & nf90_global, & trim(attribute_name), & attribute_value) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_put_att(fileobj%ncid, & nf90_global, & trim(attribute_name), & @@ -76,22 +76,22 @@ subroutine register_global_attribute_1d(fileobj, & define_mode) select type(attribute_value) - type is (integer(kind=int32)) + type is (integer(kind=i4_kind)) err = nf90_put_att(fileobj%ncid, & nf90_global, & trim(attribute_name), & attribute_value) - type is (integer(kind=int64)) + type is (integer(kind=i8_kind)) err = nf90_put_att(fileobj%ncid, & nf90_global, & trim(attribute_name), & attribute_value) - type is (real(kind=real32)) + type is (real(kind=r4_kind)) err = nf90_put_att(fileobj%ncid, & nf90_global, & trim(attribute_name), & attribute_value) - type is (real(kind=real64)) + type is (real(kind=r8_kind)) err = nf90_put_att(fileobj%ncid, & nf90_global, & trim(attribute_name), & diff --git a/fms2_io/netcdf_io.F90 b/fms2_io/netcdf_io.F90 index 
6cbcb64bc5..8bd4d8a51a 100644 --- a/fms2_io/netcdf_io.F90 +++ b/fms2_io/netcdf_io.F90 @@ -26,6 +26,7 @@ module netcdf_io_mod use netcdf use mpp_mod use fms_io_utils_mod +use platform_mod implicit none private diff --git a/horiz_interp/Makefile.am b/horiz_interp/Makefile.am index 2d39b19d2b..8ed94ee78d 100644 --- a/horiz_interp/Makefile.am +++ b/horiz_interp/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build these uninstalled convenience libraries. noinst_LTLIBRARIES = libhoriz_interp_type.la \ diff --git a/memutils/Makefile.am b/memutils/Makefile.am index 24ffde744c..90a41122a5 100644 --- a/memutils/Makefile.am +++ b/memutils/Makefile.am @@ -23,6 +23,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # The convenience library depends on its source. libmemutils_la_SOURCES = memutils.F90 diff --git a/monin_obukhov/Makefile.am b/monin_obukhov/Makefile.am index 4fe63a23e7..7a76a76810 100644 --- a/monin_obukhov/Makefile.am +++ b/monin_obukhov/Makefile.am @@ -23,6 +23,7 @@ # Ed Hartnett 2/22/19 AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform noinst_LTLIBRARIES = libmonin_obukhov.la libmonin_obukhov_inter.la diff --git a/monin_obukhov/monin_obukhov_inter.F90 b/monin_obukhov/monin_obukhov_inter.F90 index 7165c1de05..e49da7da23 100644 --- a/monin_obukhov/monin_obukhov_inter.F90 +++ b/monin_obukhov/monin_obukhov_inter.F90 @@ -19,7 +19,6 @@ module monin_obukhov_inter -#include implicit none private @@ -38,7 +37,7 @@ module monin_obukhov_inter contains -_PURE subroutine monin_obukhov_diff(vonkarm, & +pure subroutine monin_obukhov_diff(vonkarm, & & ustar_min, & & neutral, stable_option,new_mo_option,rich_crit, zeta_trans, & & ni, nj, nk, z, u_star, b_star, k_m, k_h, ier) @@ -86,7 +85,7 @@ _PURE subroutine 
monin_obukhov_diff(vonkarm, & end subroutine monin_obukhov_diff -_PURE subroutine monin_obukhov_drag_1d(grav, vonkarm, & +pure subroutine monin_obukhov_drag_1d(grav, vonkarm, & & error, zeta_min, max_iter, small, & & neutral, stable_option, new_mo_option, rich_crit, zeta_trans,& & drag_min_heat, drag_min_moist, drag_min_mom, & @@ -197,7 +196,7 @@ _PURE subroutine monin_obukhov_drag_1d(grav, vonkarm, & end subroutine monin_obukhov_drag_1d -_PURE subroutine monin_obukhov_solve_zeta(error, zeta_min, max_iter, small, & +pure subroutine monin_obukhov_solve_zeta(error, zeta_min, max_iter, small, & & stable_option, new_mo_option, rich_crit, zeta_trans, & !miz & n, rich, z, z0, zt, zq, f_m, f_t, f_q, mask, ier) @@ -314,7 +313,7 @@ end subroutine monin_obukhov_solve_zeta ! the differential similarity function for buoyancy and tracers ! Note: seems to be the same as monin_obukhov_derivative_m? -_PURE subroutine monin_obukhov_derivative_t(stable_option,new_mo_option,rich_crit, zeta_trans, & +pure subroutine monin_obukhov_derivative_t(stable_option,new_mo_option,rich_crit, zeta_trans, & & n, phi_t, zeta, mask, ier) integer, intent(in ) :: stable_option @@ -370,7 +369,7 @@ end subroutine monin_obukhov_derivative_t ! the differential similarity function for momentum -_PURE subroutine monin_obukhov_derivative_m(stable_option, rich_crit, zeta_trans, & +pure subroutine monin_obukhov_derivative_m(stable_option, rich_crit, zeta_trans, & & n, phi_m, zeta, mask, ier) integer, intent(in ) :: stable_option @@ -418,7 +417,7 @@ _PURE subroutine monin_obukhov_derivative_m(stable_option, rich_crit, zeta_trans end subroutine monin_obukhov_derivative_m -_PURE subroutine monin_obukhov_profile_1d(vonkarm, & +pure subroutine monin_obukhov_profile_1d(vonkarm, & & neutral, stable_option, new_mo_option, rich_crit, zeta_trans, & & n, zref, zref_t, z, z0, zt, zq, u_star, b_star, q_star, & & del_m, del_t, del_q, lavail, avail, ier) @@ -502,7 +501,7 @@ end subroutine monin_obukhov_profile_1d ! 
the integral similarity function for momentum -_PURE subroutine monin_obukhov_integral_m(stable_option, rich_crit, zeta_trans, & +pure subroutine monin_obukhov_integral_m(stable_option, rich_crit, zeta_trans, & & n, psi_m, zeta, zeta_0, ln_z_z0, mask, ier) integer, intent(in ) :: stable_option @@ -578,7 +577,7 @@ end subroutine monin_obukhov_integral_m ! the integral similarity function for moisture and tracers -_PURE subroutine monin_obukhov_integral_tq(stable_option, new_mo_option, rich_crit, zeta_trans, & +pure subroutine monin_obukhov_integral_tq(stable_option, new_mo_option, rich_crit, zeta_trans, & & n, psi_t, psi_q, zeta, zeta_t, zeta_q, & & ln_z_zt, ln_z_zq, mask, ier) @@ -676,7 +675,7 @@ _PURE subroutine monin_obukhov_integral_tq(stable_option, new_mo_option, rich_cr end subroutine monin_obukhov_integral_tq -_PURE subroutine monin_obukhov_stable_mix(stable_option, rich_crit, zeta_trans, & +pure subroutine monin_obukhov_stable_mix(stable_option, rich_crit, zeta_trans, & & n, rich, mix, ier) integer, intent(in ) :: stable_option diff --git a/mosaic/Makefile.am b/mosaic/Makefile.am index 2f876a03e4..b804e4c99c 100644 --- a/mosaic/Makefile.am +++ b/mosaic/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build these uninstalled convenience libraries. diff --git a/mpp/Makefile.am b/mpp/Makefile.am index 85694c6727..7729c65b1b 100644 --- a/mpp/Makefile.am +++ b/mpp/Makefile.am @@ -42,7 +42,8 @@ include/mpp_io_read.inc include/mpp_io_util.inc \ include/mpp_sum_ad.inc include/mpp_unstruct_domain.inc # Include .h and .mod files. -AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include -I${top_srcdir}/mpp/include +AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include -I${top_srcdir}/mpp/include \ + -I${top_builddir}/platform # Build these uninstalled convenience libraries. 
noinst_LTLIBRARIES = libmpp_parameter.la libmpp_data.la libmpp.la \ diff --git a/mpp/include/mpp_chksum.h b/mpp/include/mpp_chksum.h index e60f8a6741..401417c295 100644 --- a/mpp/include/mpp_chksum.h +++ b/mpp/include/mpp_chksum.h @@ -20,9 +20,9 @@ function MPP_CHKSUM_( var, pelist , mask_val) !mold is a dummy array to be used by TRANSFER() !must be same TYPE as result -!result is LONG_KIND, which will actually be int ifdef no_8byte_integers +!result is i8_kind, which will actually be int ifdef no_8byte_integers !optional mask_val is masked away in checksum_int.h function via PACK() - integer(LONG_KIND) :: MPP_CHKSUM_ + integer(i8_kind) :: MPP_CHKSUM_ integer(MPP_TRANSFER_KIND_) :: mold(1) MPP_TYPE_, intent(in) :: var MPP_RANK_ integer, intent(in), optional :: pelist(:) diff --git a/mpp/include/mpp_chksum_int.h b/mpp/include/mpp_chksum_int.h index 9071c5b0b9..7a3c6af90e 100644 --- a/mpp/include/mpp_chksum_int.h +++ b/mpp/include/mpp_chksum_int.h @@ -18,7 +18,7 @@ !*********************************************************************** function MPP_CHKSUM_INT_( var, pelist, mask_val ) - integer(LONG_KIND) :: MPP_CHKSUM_INT_ + integer(i8_kind) :: MPP_CHKSUM_INT_ MPP_TYPE_, intent(in) :: var MPP_RANK_ integer, optional :: pelist(:) MPP_TYPE_, intent(in), optional :: mask_val @@ -27,9 +27,9 @@ function MPP_CHKSUM_INT_( var, pelist, mask_val ) !PACK on var/=mask_val ignores values in var !equiv to setting those values=0, but on sparse arrays !pack should return much smaller array to sum - MPP_CHKSUM_INT_ = sum( INT( PACK(var,var/=mask_val),LONG_KIND) ) + MPP_CHKSUM_INT_ = sum( INT( PACK(var,var/=mask_val),i8_kind) ) else - MPP_CHKSUM_INT_ = sum(INT(var,LONG_KIND)) + MPP_CHKSUM_INT_ = sum(INT(var,i8_kind)) end if call mpp_sum( MPP_CHKSUM_INT_, pelist ) @@ -41,14 +41,14 @@ function MPP_CHKSUM_INT_( var, pelist, mask_val ) !Handles real mask for easier implimentation ! until exists full integer vartypes... 
function MPP_CHKSUM_INT_RMASK_( var, pelist, mask_val ) - integer(LONG_KIND) :: MPP_CHKSUM_INT_RMASK_ + integer(i8_kind) :: MPP_CHKSUM_INT_RMASK_ MPP_TYPE_, intent(in) :: var MPP_RANK_ integer, optional :: pelist(:) real, intent(in) :: mask_val integer(KIND(var))::imask_val - integer(INT_KIND)::i4tmp(2)=0 - real(FLOAT_KIND)::r4tmp(2)=0 - integer(LONG_KIND) :: i8tmp=0 + integer(i4_kind)::i4tmp(2)=0 + real(r4_kind)::r4tmp(2)=0 + integer(i8_kind) :: i8tmp=0 !high fidelity error message character(LEN=1) :: tmpStr1,tmpStr2,tmpStr3 character(LEN=32) :: tmpStr4,tmpStr5 @@ -61,7 +61,7 @@ function MPP_CHKSUM_INT_RMASK_( var, pelist, mask_val ) ! we've packed an MPP_FILL_ imask_val = MPP_FILL_INT !!! Current NETCDF fill values (AKA MPP_FILL_*) designed towards CEILING(MPP_FILL_{FLOAT,DOUBLE},kind=4byte)=MPP_FILL_INT - else if ( CEILING(mask_val,INT_KIND) == MPP_FILL_INT ) then + else if ( CEILING(mask_val, i4_kind) == MPP_FILL_INT ) then ! we've also packed an MPP_FILL_ imask_val = MPP_FILL_INT ! 
Secondary Logic: diff --git a/mpp/include/mpp_chksum_scalar.h b/mpp/include/mpp_chksum_scalar.h index 1e153604a6..ccafffafcb 100644 --- a/mpp/include/mpp_chksum_scalar.h +++ b/mpp/include/mpp_chksum_scalar.h @@ -20,12 +20,12 @@ function MPP_CHKSUM_( var, pelist, mask_val ) !mold is a dummy array to be used by TRANSFER() !must be same TYPE as result -!result is LONG_KIND, which will actually be int ifdef no_8byte_integers +!result is i8_kind, which will actually be int ifdef no_8byte_integers !mold and mask_val must be same numBytes, otherwise undefined behavior - integer(LONG_KIND) :: MPP_CHKSUM_ + integer(i8_kind) :: MPP_CHKSUM_ MPP_TYPE_, intent(in) :: var integer, intent(in), optional :: pelist(:) - integer(LONG_KIND) :: mold(1) + integer(i8_kind) :: mold(1) MPP_TYPE_, intent(in), optional :: mask_val pointer( p, mold ) diff --git a/mpp/include/mpp_comm.inc b/mpp/include/mpp_comm.inc index b5d0ca6d2f..bbb3d3e256 100644 --- a/mpp/include/mpp_comm.inc +++ b/mpp/include/mpp_comm.inc @@ -25,13 +25,12 @@ #include #endif -#ifndef no_8byte_integers #undef MPP_CHKSUM_INT_ #define MPP_CHKSUM_INT_ mpp_chksum_i8_1d #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i8_1d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:) #include @@ -41,7 +40,7 @@ #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i8_2d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:) #include @@ -51,7 +50,7 @@ #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i8_3d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:) #include @@ -61,7 +60,7 @@ #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i8_4d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define 
MPP_TYPE_ integer(i8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:) #include @@ -71,18 +70,17 @@ #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i8_5d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:,:) #include -#endif #undef MPP_CHKSUM_INT_ #define MPP_CHKSUM_INT_ mpp_chksum_i4_1d #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i4_1d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:) #include @@ -92,7 +90,7 @@ #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i4_2d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:) #include @@ -102,7 +100,7 @@ #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i4_3d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:) #include @@ -112,7 +110,7 @@ #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i4_4d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:) #include @@ -122,7 +120,7 @@ #undef MPP_CHKSUM_INT_RMASK_ #define MPP_CHKSUM_INT_RMASK_ mpp_chksum_i4_5d_rmask #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:,:) #include @@ -130,7 +128,7 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r8_0d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_RANK_ #define MPP_RANK_ ! 
#include @@ -138,9 +136,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r8_1d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:) #include @@ -148,9 +146,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r8_2d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:) #include @@ -158,9 +156,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r8_3d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:) #include @@ -168,9 +166,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r8_4d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:) #include @@ -178,9 +176,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r8_5d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:,:) #include @@ -189,9 +187,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c8_0d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_RANK_ #define MPP_RANK_ ! 
#include @@ -199,9 +197,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c8_1d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:) #include @@ -209,9 +207,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c8_2d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:) #include @@ -219,9 +217,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c8_3d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:) #include @@ -229,9 +227,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c8_4d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:) #include @@ -239,21 +237,20 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c8_5d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ LONG_KIND +#define MPP_TRANSFER_KIND_ i8_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:,:) #include #endif -#ifdef OVERLOAD_R4 #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r4_0d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_RANK_ #define MPP_RANK_ ! 
#include @@ -261,9 +258,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r4_1d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:) #include @@ -271,9 +268,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r4_2d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:) #include @@ -281,9 +278,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r4_3d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:) #include @@ -291,9 +288,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r4_4d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:) #include @@ -301,21 +298,20 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_r4_5d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:,:) #include -#endif #ifdef OVERLOAD_C4 #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c4_0d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_RANK_ #define MPP_RANK_ ! 
#include @@ -323,9 +319,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c4_1d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:) #include @@ -333,9 +329,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c4_2d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:) #include @@ -343,9 +339,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c4_3d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:) #include @@ -353,9 +349,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c4_4d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:) #include @@ -363,9 +359,9 @@ #undef MPP_CHKSUM_ #define MPP_CHKSUM_ mpp_chksum_c4_5d #undef MPP_TRANSFER_KIND_ -#define MPP_TRANSFER_KIND_ INT_KIND +#define MPP_TRANSFER_KIND_ i4_kind #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_RANK_ #define MPP_RANK_ (:,:,:,:,:) #include @@ -387,7 +383,7 @@ #undef MPP_GATHER_1D_ #undef MPP_GATHER_1DV_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #define MPP_GATHER_1D_ mpp_gather_int4_1d #define MPP_GATHER_1DV_ mpp_gather_int4_1dv #undef MPP_GATHER_PELIST_2D_ @@ -399,7 +395,7 @@ #undef MPP_GATHER_1D_ #undef MPP_GATHER_1DV_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) 
+#define MPP_TYPE_ real(r4_kind) #define MPP_GATHER_1D_ mpp_gather_real4_1d #define MPP_GATHER_1DV_ mpp_gather_real4_1dv #undef MPP_GATHER_PELIST_2D_ @@ -411,7 +407,7 @@ #undef MPP_GATHER_1D_ #undef MPP_GATHER_1DV_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #define MPP_GATHER_1D_ mpp_gather_real8_1d #define MPP_GATHER_1DV_ mpp_gather_real8_1dv #undef MPP_GATHER_PELIST_2D_ @@ -424,7 +420,7 @@ #undef MPP_SCATTER_PELIST_2D_ #undef MPP_SCATTER_PELIST_3D_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #define MPP_SCATTER_PELIST_2D_ mpp_scatter_pelist_int4_2d #define MPP_SCATTER_PELIST_3D_ mpp_scatter_pelist_int4_3d #include @@ -432,7 +428,7 @@ #undef MPP_SCATTER_PELIST_2D_ #undef MPP_SCATTER_PELIST_3D_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #define MPP_SCATTER_PELIST_2D_ mpp_scatter_pelist_real4_2d #define MPP_SCATTER_PELIST_3D_ mpp_scatter_pelist_real4_3d #include @@ -440,7 +436,7 @@ #undef MPP_SCATTER_PELIST_2D_ #undef MPP_SCATTER_PELIST_3D_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #define MPP_SCATTER_PELIST_2D_ mpp_scatter_pelist_real8_2d #define MPP_SCATTER_PELIST_3D_ mpp_scatter_pelist_real8_3d #include diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc index c776be6620..d3fb1bc190 100644 --- a/mpp/include/mpp_comm_mpi.inc +++ b/mpp/include/mpp_comm_mpi.inc @@ -447,7 +447,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_real8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -504,7 +504,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_cmplx8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_TYPE_BYTELEN_ #define 
MPP_TYPE_BYTELEN_ 16 #undef MPI_TYPE_ @@ -561,7 +561,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_real4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -618,7 +618,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_cmplx4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -626,7 +626,6 @@ end subroutine mpp_exit #include #endif -#ifndef no_8byte_integers #undef MPP_TRANSMIT_ #define MPP_TRANSMIT_ mpp_transmit_int8 #undef MPP_TRANSMIT_SCALAR_ @@ -676,13 +675,12 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_int8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #include -#endif #undef MPP_TRANSMIT_ #define MPP_TRANSMIT_ mpp_transmit_int4 @@ -733,14 +731,13 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_int4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER4 #include -#ifndef no_8byte_integers #undef MPP_TRANSMIT_ #define MPP_TRANSMIT_ mpp_transmit_logical8 #undef MPP_TRANSMIT_SCALAR_ @@ -790,13 +787,12 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_logical8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #include -#endif #undef MPP_TRANSMIT_ #define MPP_TRANSMIT_ mpp_transmit_logical4 @@ 
-847,7 +843,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_logical4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -864,7 +860,7 @@ end subroutine mpp_exit #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_max_real8_1d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -873,13 +869,12 @@ end subroutine mpp_exit #define MPI_REDUCE_ MPI_MAX #include -#ifdef OVERLOAD_R4 #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_max_real4_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_max_real4_1d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -887,15 +882,13 @@ end subroutine mpp_exit #undef MPI_REDUCE_ #define MPI_REDUCE_ MPI_MAX #include -#endif -#ifndef no_8byte_integers #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_max_int8_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_max_int8_1d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -903,14 +896,13 @@ end subroutine mpp_exit #undef MPI_REDUCE_ #define MPI_REDUCE_ MPI_MAX #include -#endif #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_max_int4_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_max_int4_1d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -924,7 +916,7 @@ end subroutine mpp_exit #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_min_real8_1d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ 
@@ -933,13 +925,12 @@ end subroutine mpp_exit #define MPI_REDUCE_ MPI_MIN #include -#ifdef OVERLOAD_R4 #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_min_real4_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_min_real4_1d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -947,15 +938,13 @@ end subroutine mpp_exit #undef MPI_REDUCE_ #define MPI_REDUCE_ MPI_MIN #include -#endif -#ifndef no_8byte_integers #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_min_int8_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_min_int8_1d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -963,14 +952,13 @@ end subroutine mpp_exit #undef MPI_REDUCE_ #define MPI_REDUCE_ MPI_MIN #include -#endif #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_min_int4_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_min_int4_1d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -992,7 +980,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_real8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL8 #undef MPP_TYPE_BYTELEN_ @@ -1013,7 +1001,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_cmplx8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_DOUBLE_COMPLEX #undef MPP_TYPE_BYTELEN_ @@ -1021,7 +1009,6 @@ end subroutine mpp_exit #include #endif -#ifdef OVERLOAD_R4 #undef MPP_SUM_ #define MPP_SUM_ mpp_sum_real4 #undef MPP_SUM_SCALAR_ @@ -1035,13 +1022,12 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_real4_5d #undef MPP_TYPE_ 
-#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL4 #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #include -#endif #ifdef OVERLOAD_C4 #undef MPP_SUM_ @@ -1057,7 +1043,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_cmplx4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_COMPLEX #undef MPP_TYPE_BYTELEN_ @@ -1065,7 +1051,6 @@ end subroutine mpp_exit #include #endif -#ifndef no_8byte_integers #undef MPP_SUM_ #define MPP_SUM_ mpp_sum_int8 #undef MPP_SUM_SCALAR_ @@ -1079,13 +1064,12 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_int8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #include -#endif #undef MPP_SUM_ #define MPP_SUM_ mpp_sum_int4 @@ -1100,7 +1084,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_int4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER4 #undef MPP_TYPE_BYTELEN_ @@ -1120,7 +1104,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_real8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL8 #undef MPP_TYPE_BYTELEN_ @@ -1141,7 +1125,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_cmplx8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_DOUBLE_COMPLEX #undef MPP_TYPE_BYTELEN_ @@ -1149,7 +1133,6 @@ end subroutine mpp_exit #include #endif -#ifdef OVERLOAD_R4 #undef MPP_SUM_AD_ #define MPP_SUM_AD_ mpp_sum_real4_ad #undef MPP_SUM_SCALAR_AD_ @@ -1163,13 
+1146,12 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_real4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL4 #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #include -#endif #ifdef OVERLOAD_C4 #undef MPP_SUM_AD_ @@ -1185,7 +1167,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_cmplx4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_COMPLEX #undef MPP_TYPE_BYTELEN_ @@ -1193,7 +1175,6 @@ end subroutine mpp_exit #include #endif -#ifndef no_8byte_integers #undef MPP_SUM_AD_ #define MPP_SUM_AD_ mpp_sum_int8_ad #undef MPP_SUM_SCALAR_AD_ @@ -1207,13 +1188,12 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_int8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #include -#endif #undef MPP_SUM_AD_ #define MPP_SUM_AD_ mpp_sum_int4_ad @@ -1228,7 +1208,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_int4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER4 #undef MPP_TYPE_BYTELEN_ @@ -1250,7 +1230,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_int4 #define MPP_ALLTOALLV_ mpp_alltoall_int4_v #define MPP_ALLTOALLW_ mpp_alltoall_int4_w -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #define MPP_TYPE_BYTELEN_ 4 #define MPI_TYPE_ MPI_INTEGER4 #include @@ -1264,7 +1244,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_int8 #define MPP_ALLTOALLV_ mpp_alltoall_int8_v #define MPP_ALLTOALLW_ mpp_alltoall_int8_w -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) 
#define MPP_TYPE_BYTELEN_ 8 #define MPI_TYPE_ MPI_INTEGER8 #include @@ -1278,7 +1258,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_real4 #define MPP_ALLTOALLV_ mpp_alltoall_real4_v #define MPP_ALLTOALLW_ mpp_alltoall_real4_w -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #define MPP_TYPE_BYTELEN_ 4 #define MPI_TYPE_ MPI_REAL4 #include @@ -1292,7 +1272,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_real8 #define MPP_ALLTOALLV_ mpp_alltoall_real8_v #define MPP_ALLTOALLW_ mpp_alltoall_real8_w -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #define MPP_TYPE_BYTELEN_ 8 #define MPI_TYPE_ MPI_REAL8 #include @@ -1306,7 +1286,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_cmplx4 #define MPP_ALLTOALLV_ mpp_alltoall_cmplx4_v #define MPP_ALLTOALLW_ mpp_alltoall_cmplx4_w -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #define MPP_TYPE_BYTELEN_ 8 #define MPI_TYPE_ MPI_COMPLEX8 #include @@ -1320,7 +1300,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_cmplx8 #define MPP_ALLTOALLV_ mpp_alltoall_cmplx8_v #define MPP_ALLTOALLW_ mpp_alltoall_cmplx8_w -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #define MPP_TYPE_BYTELEN_ 16 #define MPI_TYPE_ MPI_COMPLEX16 #include @@ -1334,7 +1314,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_logical4 #define MPP_ALLTOALLV_ mpp_alltoall_logical4_v #define MPP_ALLTOALLW_ mpp_alltoall_logical4_w -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #define MPP_TYPE_BYTELEN_ 4 #define MPI_TYPE_ MPI_INTEGER4 #include @@ -1348,7 +1328,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_logical8 #define MPP_ALLTOALLV_ mpp_alltoall_logical8_v #define MPP_ALLTOALLW_ mpp_alltoall_logical8_w -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #define MPP_TYPE_BYTELEN_ 8 #define MPI_TYPE_ MPI_INTEGER8 #include @@ -1363,7 
+1343,7 @@ end subroutine mpp_exit #undef MPP_TYPE_ #undef MPI_TYPE_ #define MPP_TYPE_CREATE_ mpp_type_create_int4 -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #define MPI_TYPE_ MPI_INTEGER4 #include @@ -1371,7 +1351,7 @@ end subroutine mpp_exit #undef MPP_TYPE_ #undef MPI_TYPE_ #define MPP_TYPE_CREATE_ mpp_type_create_int8 -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #define MPI_TYPE_ MPI_INTEGER8 #include @@ -1379,7 +1359,7 @@ end subroutine mpp_exit #undef MPP_TYPE_ #undef MPI_TYPE_ #define MPP_TYPE_CREATE_ mpp_type_create_real4 -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #define MPI_TYPE_ MPI_REAL4 #include @@ -1387,7 +1367,7 @@ end subroutine mpp_exit #undef MPP_TYPE_ #undef MPI_TYPE_ #define MPP_TYPE_CREATE_ mpp_type_create_real8 -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #define MPI_TYPE_ MPI_REAL8 #include @@ -1395,7 +1375,7 @@ end subroutine mpp_exit #undef MPP_TYPE_ #undef MPI_TYPE_ #define MPP_TYPE_CREATE_ mpp_type_create_cmplx4 -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #define MPI_TYPE_ MPI_COMPLEX8 #include @@ -1403,7 +1383,7 @@ end subroutine mpp_exit #undef MPP_TYPE_ #undef MPI_TYPE_ #define MPP_TYPE_CREATE_ mpp_type_create_cmplx8 -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #define MPI_TYPE_ MPI_COMPLEX16 #include @@ -1411,7 +1391,7 @@ end subroutine mpp_exit #undef MPP_TYPE_ #undef MPI_TYPE_ #define MPP_TYPE_CREATE_ mpp_type_create_logical4 -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #define MPI_TYPE_ MPI_INTEGER4 #include @@ -1419,7 +1399,7 @@ end subroutine mpp_exit #undef MPP_TYPE_ #undef MPI_TYPE_ #define MPP_TYPE_CREATE_ mpp_type_create_logical8 -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #define MPI_TYPE_ MPI_INTEGER8 #include diff --git a/mpp/include/mpp_comm_nocomm.inc b/mpp/include/mpp_comm_nocomm.inc index 
33e8be0f7b..540eba7f7c 100644 --- a/mpp/include/mpp_comm_nocomm.inc +++ b/mpp/include/mpp_comm_nocomm.inc @@ -339,7 +339,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_real8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -396,7 +396,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_cmplx8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 16 #undef MPI_TYPE_ @@ -453,7 +453,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_real4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -510,7 +510,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_cmplx4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -518,7 +518,6 @@ end subroutine mpp_exit #include #endif -#ifndef no_8byte_integers #undef MPP_TRANSMIT_ #define MPP_TRANSMIT_ mpp_transmit_int8 #undef MPP_TRANSMIT_SCALAR_ @@ -568,13 +567,12 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_int8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #include -#endif #undef MPP_TRANSMIT_ #define MPP_TRANSMIT_ mpp_transmit_int4 @@ -625,14 +623,13 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_int4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef 
MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER4 #include -#ifndef no_8byte_integers #undef MPP_TRANSMIT_ #define MPP_TRANSMIT_ mpp_transmit_logical8 #undef MPP_TRANSMIT_SCALAR_ @@ -682,13 +679,12 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_logical8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #include -#endif #undef MPP_TRANSMIT_ #define MPP_TRANSMIT_ mpp_transmit_logical4 @@ -739,7 +735,7 @@ end subroutine mpp_exit #undef MPP_BROADCAST_5D_ #define MPP_BROADCAST_5D_ mpp_broadcast_logical4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -756,7 +752,7 @@ end subroutine mpp_exit #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_max_real8_1d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -765,13 +761,12 @@ end subroutine mpp_exit #define MPI_REDUCE_ MPI_MAX #include -#ifdef OVERLOAD_R4 #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_max_real4_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_max_real4_1d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -779,15 +774,13 @@ end subroutine mpp_exit #undef MPI_REDUCE_ #define MPI_REDUCE_ MPI_MAX #include -#endif -#ifndef no_8byte_integers #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_max_int8_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_max_int8_1d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -795,14 +788,13 @@ end 
subroutine mpp_exit #undef MPI_REDUCE_ #define MPI_REDUCE_ MPI_MAX #include -#endif #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_max_int4_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_max_int4_1d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -816,7 +808,7 @@ end subroutine mpp_exit #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_min_real8_1d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -825,13 +817,12 @@ end subroutine mpp_exit #define MPI_REDUCE_ MPI_MIN #include -#ifdef OVERLOAD_R4 #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_min_real4_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_min_real4_1d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -839,15 +830,13 @@ end subroutine mpp_exit #undef MPI_REDUCE_ #define MPI_REDUCE_ MPI_MIN #include -#endif -#ifndef no_8byte_integers #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_min_int8_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_min_int8_1d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #undef MPI_TYPE_ @@ -855,14 +844,13 @@ end subroutine mpp_exit #undef MPI_REDUCE_ #define MPI_REDUCE_ MPI_MIN #include -#endif #undef MPP_REDUCE_0D_ #define MPP_REDUCE_0D_ mpp_min_int4_0d #undef MPP_REDUCE_1D_ #define MPP_REDUCE_1D_ mpp_min_int4_1d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #undef MPI_TYPE_ @@ -884,7 +872,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_real8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ 
real(r8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL8 #undef MPP_TYPE_BYTELEN_ @@ -905,7 +893,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_cmplx8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_DOUBLE_COMPLEX #undef MPP_TYPE_BYTELEN_ @@ -913,7 +901,6 @@ end subroutine mpp_exit #include #endif -#ifdef OVERLOAD_R4 #undef MPP_SUM_ #define MPP_SUM_ mpp_sum_real4 #undef MPP_SUM_SCALAR_ @@ -927,13 +914,12 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_real4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL4 #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #include -#endif #ifdef OVERLOAD_C4 #undef MPP_SUM_ @@ -949,7 +935,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_cmplx4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_COMPLEX #undef MPP_TYPE_BYTELEN_ @@ -957,7 +943,6 @@ end subroutine mpp_exit #include #endif -#ifndef no_8byte_integers #undef MPP_SUM_ #define MPP_SUM_ mpp_sum_int8 #undef MPP_SUM_SCALAR_ @@ -971,13 +956,12 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_int8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #include -#endif #undef MPP_SUM_ #define MPP_SUM_ mpp_sum_int4 @@ -992,7 +976,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_ #define MPP_SUM_5D_ mpp_sum_int4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER4 #undef MPP_TYPE_BYTELEN_ @@ -1012,7 +996,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_real8_5d_ad 
#undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL8 #undef MPP_TYPE_BYTELEN_ @@ -1033,7 +1017,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_cmplx8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_DOUBLE_COMPLEX #undef MPP_TYPE_BYTELEN_ @@ -1041,7 +1025,6 @@ end subroutine mpp_exit #include #endif -#ifdef OVERLOAD_R4 #undef MPP_SUM_AD_ #define MPP_SUM_AD_ mpp_sum_real4_ad #undef MPP_SUM_SCALAR_AD_ @@ -1055,13 +1038,12 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_real4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL4 #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 4 #include -#endif #ifdef OVERLOAD_C4 #undef MPP_SUM_AD_ @@ -1077,7 +1059,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_cmplx4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_COMPLEX #undef MPP_TYPE_BYTELEN_ @@ -1085,7 +1067,6 @@ end subroutine mpp_exit #include #endif -#ifndef no_8byte_integers #undef MPP_SUM_AD_ #define MPP_SUM_AD_ mpp_sum_int8_ad #undef MPP_SUM_SCALAR_AD_ @@ -1099,13 +1080,12 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_int8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #undef MPP_TYPE_BYTELEN_ #define MPP_TYPE_BYTELEN_ 8 #include -#endif #undef MPP_SUM_AD_ #define MPP_SUM_AD_ mpp_sum_int4_ad @@ -1120,7 +1100,7 @@ end subroutine mpp_exit #undef MPP_SUM_5D_AD_ #define MPP_SUM_5D_AD_ mpp_sum_int4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPI_TYPE_ 
#define MPI_TYPE_ MPI_INTEGER4 #undef MPP_TYPE_BYTELEN_ @@ -1142,7 +1122,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_int4 #define MPP_ALLTOALLV_ mpp_alltoall_int4_v #define MPP_ALLTOALLW_ mpp_alltoall_int4_w -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #define MPP_TYPE_BYTELEN_ 4 #define MPI_TYPE_ MPI_INTEGER4 #include @@ -1156,7 +1136,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_int8 #define MPP_ALLTOALLV_ mpp_alltoall_int8_v #define MPP_ALLTOALLW_ mpp_alltoall_int8_w -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #define MPP_TYPE_BYTELEN_ 8 #define MPI_TYPE_ MPI_INTEGER8 #include @@ -1170,7 +1150,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_real4 #define MPP_ALLTOALLV_ mpp_alltoall_real4_v #define MPP_ALLTOALLW_ mpp_alltoall_real4_w -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #define MPP_TYPE_BYTELEN_ 4 #define MPI_TYPE_ MPI_REAL4 #include @@ -1184,7 +1164,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_real8 #define MPP_ALLTOALLV_ mpp_alltoall_real8_v #define MPP_ALLTOALLW_ mpp_alltoall_real8_w -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #define MPP_TYPE_BYTELEN_ 8 #define MPI_TYPE_ MPI_REAL8 #include @@ -1198,7 +1178,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_logical4 #define MPP_ALLTOALLV_ mpp_alltoall_logical4_v #define MPP_ALLTOALLW_ mpp_alltoall_logical4_w -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #define MPP_TYPE_BYTELEN_ 4 #define MPI_TYPE_ MPI_INTEGER4 #include @@ -1212,7 +1192,7 @@ end subroutine mpp_exit #define MPP_ALLTOALL_ mpp_alltoall_logical8 #define MPP_ALLTOALLV_ mpp_alltoall_logical8_v #define MPP_ALLTOALLW_ mpp_alltoall_logical8_w -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #define MPP_TYPE_BYTELEN_ 8 #define MPI_TYPE_ MPI_INTEGER8 #include @@ -1223,32 +1203,32 @@ end subroutine 
mpp_exit !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #define MPP_TYPE_CREATE_ mpp_type_create_int4 -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #define MPI_TYPE_ MPI_INTEGER4 #include #define MPP_TYPE_CREATE_ mpp_type_create_int8 -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #define MPI_TYPE_ MPI_INTEGER8 #include #define MPP_TYPE_CREATE_ mpp_type_create_real4 -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #define MPI_TYPE_ MPI_REAL4 #include #define MPP_TYPE_CREATE_ mpp_type_create_real8 -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #define MPI_TYPE_ MPI_REAL8 #include #define MPP_TYPE_CREATE_ mpp_type_create_logical4 -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #define MPI_TYPE_ MPI_INTEGER4 #include #define MPP_TYPE_CREATE_ mpp_type_create_logical8 -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #define MPI_TYPE_ MPI_INTEGER8 #include diff --git a/mpp/include/mpp_data_mpi.inc b/mpp/include/mpp_data_mpi.inc index 4b05e5a365..00839df558 100644 --- a/mpp/include/mpp_data_mpi.inc +++ b/mpp/include/mpp_data_mpi.inc @@ -23,7 +23,7 @@ ! The following data is used in mpp_mod and its components ! !----------------------------------------------------------------! integer :: stat(MPI_STATUS_SIZE) -real(DOUBLE_KIND), allocatable :: mpp_stack(:) +real(r8_kind), allocatable :: mpp_stack(:) !--- some dummy variables with dummy values that will never be used integer, parameter :: ptr_stack = -999 @@ -35,8 +35,8 @@ integer, parameter :: mpp_from_pe = -999, ptr_from = -999 !-------------------------------------------------------------------! ! The following data is used in mpp_domains_mod and its components ! !-------------------------------------------------------------------! 
-real(DOUBLE_KIND), allocatable :: mpp_domains_stack(:) -real(DOUBLE_KIND), allocatable :: mpp_domains_stack_nonblock(:) +real(r8_kind), allocatable :: mpp_domains_stack(:) +real(r8_kind), allocatable :: mpp_domains_stack_nonblock(:) !--- some dummy variables with dummy values that will never be used integer, parameter :: ptr_domains_stack = -999 integer, parameter :: ptr_domains_stack_nonblock = -999 diff --git a/mpp/include/mpp_data_nocomm.inc b/mpp/include/mpp_data_nocomm.inc index d513d59328..8969e479a8 100644 --- a/mpp/include/mpp_data_nocomm.inc +++ b/mpp/include/mpp_data_nocomm.inc @@ -22,7 +22,7 @@ !----------------------------------------------------------------! ! The following data is used in mpp_mod and its components ! !----------------------------------------------------------------! -real(DOUBLE_KIND), allocatable :: mpp_stack(:) +real(r8_kind), allocatable :: mpp_stack(:) !--- some dummy variables with dummy values that will never be used integer, parameter :: stat=-999 @@ -35,8 +35,8 @@ integer, parameter :: mpp_from_pe = -999, ptr_from = -999 !-------------------------------------------------------------------! ! The following data is used in mpp_domains_mod and its components ! !-------------------------------------------------------------------! 
-real(DOUBLE_KIND), allocatable :: mpp_domains_stack(:) -real(DOUBLE_KIND), allocatable :: mpp_domains_stack_nonblock(:) +real(r8_kind), allocatable :: mpp_domains_stack(:) +real(r8_kind), allocatable :: mpp_domains_stack_nonblock(:) !--- some dummy variables with dummy values that will never be used integer, parameter :: ptr_domains_stack = -999 integer, parameter :: ptr_domains_stack_nonblock = -999 diff --git a/mpp/include/mpp_define_nest_domains.inc b/mpp/include/mpp_define_nest_domains.inc index 13d8f2dd1f..0a25a8b9be 100644 --- a/mpp/include/mpp_define_nest_domains.inc +++ b/mpp/include/mpp_define_nest_domains.inc @@ -21,20 +21,88 @@ !*********************************************************************** !############################################################################# -! Currently the contact will be limited to overlap contact. +!> @brief Set up a domain to pass data between aligned coarse and fine grid of nested model. +!> @detailed Set up a domain to pass data between aligned coarse and fine grid of a nested +!! model. Supports multiple and telescoping nests. A telescoping nest is defined as +!! a nest within a nest. Nest domains may span multiple tiles, but cannot contain a +!! coarse-grid, cube corner. Concurrent nesting is the only supported mechanism, +!! i.e. coarse and fine grid are on individual, non-overlapping, processor lists. +!! Coarse and fine grid domain need to be defined before calling mpp_define_nest_domains. +!! An mpp_broadcast is needed to broadcast both fine and coarse grid domain onto all processors.\n +!!\n +!! mpp_update_nest_coarse is used to pass data from fine grid to coarse grid computing domain. +!! mpp_update_nest_fine is used to pass data from coarse grid to fine grid halo. +!! You may call mpp_get_C2F_index before calling mpp_update_nest_fine to get the index for +!! passing data from coarse to fine. You may call mpp_get_F2C_index before calling +!! 
mpp_update_nest_coarse to get the index for passing data from coarse to fine. +! +!> @note: The following tests for nesting of regular lat-lon grids upon a cubed-sphere +!! grid are done in test_mpp_domains:\n +!! a) a first-level nest spanning multiple cubed-sphere faces (tiles 1, 2, & 4)\n +!! b) a first-level nest wholly contained within tile 3\n +!! c) a second-level nest contained within the nest mentioned in a)\n +!! Tests are done for data at T, E, C, N-cell center.\n +! +!> @example Below is an example to pass data between fine and coarse grid (More details on how to +!! use the nesting domain update are available in routine test_update_nest_domain of +!! test_fms/mpp/test_mpp_domains.F90.\n +!!\n +!! if( concurrent ) then\n +!! call mpp_broadcast_domain(domain_fine)\n +!! call mpp_broadcast_domain(domain_coarse)\n +!! endif \n +!! \n +!! call mpp_define_nest_domains (nest_domain, domain, num_nest, nest_level(1:num_nest), &\n +!! tile_fine(1:num_nest), tile_coarse(1:num_nest), &\n +!! istart_coarse(1:num_nest), icount_coarse(1:num_nest), &\n +!! jstart_coarse(1:num_nest), jcount_coarse(1:num_nest), &\n +!! npes_nest_tile, x_refine(1:num_nest), y_refine(1:num_nest), &\n +!! extra_halo=extra_halo, name="nest_domain")\n +!!\n +!! call mpp_get_C2F_index(nest_domain, isw_f, iew_f, jsw_f, jew_f, isw_c, iew_c, jsw_c, jew_c, WEST, level)\n +!! call mpp_get_C2F_index(nest_domain, ise_f, iee_f, jse_f, jee_f, ise_c, iee_c, jse_c, jee_c, EAST, level)\n +!! call mpp_get_C2F_index(nest_domain, iss_f, ies_f, jss_f, jes_f, iss_c, ies_c, jss_c, jes_c, SOUTH, level)\n +!! call mpp_get_C2F_index(nest_domain, isn_f, ien_f, jsn_f, jen_f, isn_c, ien_c, jsn_c, jen_c, NORTH, level)\n +!!\n +!! allocate(wbuffer(isw_c:iew_c, jsw_c:jew_c,nz))\n +!! allocate(ebuffer(ise_c:iee_c, jse_c:jee_c,nz))\n +!! allocate(sbuffer(iss_c:ies_c, jss_c:jes_c,nz))\n +!! allocate(nbuffer(isn_c:ien_c, jsn_c:jen_c,nz))\n +!! 
call mpp_update_nest_fine(x, nest_domain, wbuffer, sbuffer, ebuffer, nbuffer)\n +!!\n +!! call mpp_get_F2C_index(nest_domain, is_c, ie_c, js_c, je_c, is_f, ie_f, js_f, je_f, nest_level=level)\n +!! allocate(buffer (is_f:ie_f, js_f:je_f,nz))\n +!! call mpp_update_nest_coarse(x, nest_domain, buffer) +! +!> @example all mpp_define_nest_domains (nest_domain, domain, num_nest, nest_level, tile_fine, tile_coarse, &\n +!! istart_coarse, icount_coarse, jstart_coarse, jcount_coarse, &\n +!! npes_nest_tile, x_refine, y_refine, extra_halo, name) +! +!> @note Currently the contact will be limited to overlap contact. + subroutine mpp_define_nest_domains(nest_domain, domain, num_nest, nest_level, tile_fine, tile_coarse, & istart_coarse, icount_coarse, jstart_coarse, jcount_coarse, npes_nest_tile, & x_refine, y_refine, extra_halo, name) - type(nest_domain_type), intent(inout) :: nest_domain - type(domain2D), target, intent(in ) :: domain - integer, intent(in ) :: num_nest - integer, intent(in ) :: nest_level(:) - integer, intent(in ) :: tile_fine(:), tile_coarse(:) - integer, intent(in ) :: istart_coarse(:), icount_coarse(:), jstart_coarse(:), jcount_coarse(:) - integer, intent(in ) :: npes_nest_tile(:) - integer, intent(in ) :: x_refine(:), y_refine(:) - integer, optional, intent(in ) :: extra_halo - character(len=*), optional, intent(in ) :: name + type(nest_domain_type), intent(inout) :: nest_domain !< holds the information to pass data + !! between nest and parent grids. + type(domain2D), target, intent(in ) :: domain !< domain for the grid defined in the current pelist + integer, intent(in ) :: num_nest !< number of nests + integer, intent(in ) :: nest_level(:) !< array containing the nest level for each nest + !!(>1 implies a telescoping nest) + integer, intent(in ) :: tile_fine(:), tile_coarse(:) !< array containing tile number of the + !! nest grid (monotonically increasing starting with 7), + !! array containing tile number of the parent grid corresponding + !! 
to the lower left corner of a given nest + integer, intent(in ) :: istart_coarse(:), icount_coarse(:), jstart_coarse(:), jcount_coarse(:) ! @brief Get the index of the data passed from coarse grid to fine grid. + !> @detailed Get the index of the data passed from coarse grid to fine grid. + !> @example call mpp_get_C2F_index(nest_domain, is_fine, ie_fine, js_fine, je_fine, + !! is_coarse, ie_coarse, js_coarse, je_coarse, dir, + !! nest_level, position) + ! subroutine mpp_get_C2F_index(nest_domain, is_fine, ie_fine, js_fine, je_fine, & is_coarse, ie_coarse, js_coarse, je_coarse, dir, nest_level, position) - type(nest_domain_type), intent(in ) :: nest_domain - integer, intent(out) :: is_fine, ie_fine, js_fine, je_fine - integer, intent(out) :: is_coarse, ie_coarse, js_coarse, je_coarse - integer, intent(in ) :: dir, nest_level - integer, optional, intent(in ) :: position + type(nest_domain_type), intent(in ) :: nest_domain !< holds the information to pass data + !! between fine and coarse grids + integer, intent(out) :: is_fine, ie_fine, js_fine, je_fine !< index in the fine + !! grid of the nested region + integer, intent(out) :: is_coarse, ie_coarse, js_coarse, je_coarse !< index in the coarse + !! grid of the nested region + integer, intent(in ) :: dir, nest_level !< direction of the halo update. + !! Its value should be WEST, EAST, SOUTH or NORTH.; + !! level of the nest (> 1 implies a telescoping nest) + integer, optional, intent(in ) :: position !< Cell position. It value should be CENTER, + !! EAST, CORNER, or NORTH. 
integer :: update_position type(nestSpec), pointer :: update => NULL() @@ -1517,15 +1597,18 @@ function search_C2F_nest_overlap(nest_domain, nest_level, extra_halo, position) end subroutine mpp_get_C2F_index - !################################################################ subroutine mpp_get_F2C_index_fine(nest_domain, is_coarse, ie_coarse, js_coarse, je_coarse, & is_fine, ie_fine, js_fine, je_fine, nest_level, position) - type(nest_domain_type), intent(in ) :: nest_domain - integer, intent(out) :: is_fine, ie_fine, js_fine, je_fine - integer, intent(out) :: is_coarse, ie_coarse, js_coarse, je_coarse - integer, intent(in) :: nest_level - integer, optional, intent(in ) :: position + type(nest_domain_type), intent(in ) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + integer, intent(out) :: is_fine, ie_fine, js_fine, je_fine !< index in the fine + !! grid of the nested region + integer, intent(out) :: is_coarse, ie_coarse, js_coarse, je_coarse !< index in + !! the coarse grid of the nested region + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + integer, optional, intent(in ) :: position !< Cell position. It value should be CENTER, + !! EAST, CORNER, or NORTH. integer :: update_position type(nestSpec), pointer :: update => NULL() @@ -1565,10 +1648,13 @@ function search_C2F_nest_overlap(nest_domain, nest_level, extra_halo, position) !################################################################ subroutine mpp_get_F2C_index_coarse(nest_domain, is_coarse, ie_coarse, js_coarse, je_coarse, nest_level, position) - type(nest_domain_type), intent(in ) :: nest_domain - integer, intent(out) :: is_coarse, ie_coarse, js_coarse, je_coarse - integer, intent(in ) :: nest_level - integer, optional, intent(in ) :: position + type(nest_domain_type), intent(in ) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. 
+ integer, intent(out) :: is_coarse, ie_coarse, js_coarse, je_coarse !< index in the fine + !! grid of the nested region + integer, intent(in ) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + integer, optional, intent(in ) :: position !< Cell position. It value should be CENTER, + !! EAST, CORNER, or NORTH. integer :: update_position type(nestSpec), pointer :: update => NULL() diff --git a/mpp/include/mpp_do_check.h b/mpp/include/mpp_do_check.h index 362e6912f6..51b72cfccb 100644 --- a/mpp/include/mpp_do_check.h +++ b/mpp/include/mpp_do_check.h @@ -19,7 +19,7 @@ !*********************************************************************** subroutine MPP_DO_CHECK_3D_( f_addrs, domain, check, d_type, ke, flags, name) !updates data domain of 3D field whose computational domains have been computed - integer(LONG_KIND), intent(in) :: f_addrs(:,:) + integer(i8_kind), intent(in) :: f_addrs(:,:) type(domain2D), intent(in) :: domain type(overlapSpec), intent(in) :: check MPP_TYPE_, intent(in) :: d_type ! 
creates unique interface diff --git a/mpp/include/mpp_do_checkV.h b/mpp/include/mpp_do_checkV.h index d07aed7bad..f3ba2e4402 100644 --- a/mpp/include/mpp_do_checkV.h +++ b/mpp/include/mpp_do_checkV.h @@ -20,7 +20,7 @@ subroutine MPP_DO_CHECK_3D_V_(f_addrsx,f_addrsy, domain, check_x, check_y, & d_type, ke, flags, name) !updates data domain of 3D field whose computational domains have been computed - integer(LONG_KIND), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) + integer(i8_kind), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) type(domain2d), intent(in) :: domain type(overlapSpec), intent(in) :: check_x, check_y integer, intent(in) :: ke diff --git a/mpp/include/mpp_do_get_boundary.h b/mpp/include/mpp_do_get_boundary.h index cc5c5a2e22..f346e6e629 100644 --- a/mpp/include/mpp_do_get_boundary.h +++ b/mpp/include/mpp_do_get_boundary.h @@ -20,8 +20,8 @@ subroutine MPP_DO_GET_BOUNDARY_3D_( f_addrs, domain, bound, b_addrs, bsize, ke, d_type) type(domain2D), intent(in) :: domain type(overlapSpec), intent(in) :: bound - integer(LONG_KIND), intent(in) :: f_addrs(:,:) - integer(LONG_KIND), intent(in) :: b_addrs(:,:,:) + integer(i8_kind), intent(in) :: f_addrs(:,:) + integer(i8_kind), intent(in) :: b_addrs(:,:,:) integer, intent(in) :: bsize(:), ke MPP_TYPE_, intent(in) :: d_type ! creates unique interface @@ -329,8 +329,8 @@ subroutine MPP_DO_GET_BOUNDARY_3D_V_(f_addrsx, f_addrsy, domain, boundx, boundy, bsizex, bsizey, ke, d_type, flags, gridtype) type(domain2D), intent(in) :: domain type(overlapSpec), intent(in) :: boundx, boundy - integer(LONG_KIND), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) - integer(LONG_KIND), intent(in) :: b_addrsx(:,:,:), b_addrsy(:,:,:) + integer(i8_kind), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) + integer(i8_kind), intent(in) :: b_addrsx(:,:,:), b_addrsy(:,:,:) integer, intent(in) :: bsizex(:), bsizey(:), ke MPP_TYPE_, intent(in) :: d_type ! 
creates unique interface integer, intent(in) :: flags diff --git a/mpp/include/mpp_do_get_boundary_ad.h b/mpp/include/mpp_do_get_boundary_ad.h index 770f44e4c9..a784192162 100644 --- a/mpp/include/mpp_do_get_boundary_ad.h +++ b/mpp/include/mpp_do_get_boundary_ad.h @@ -23,8 +23,8 @@ subroutine MPP_DO_GET_BOUNDARY_AD_3D_( f_addrs, domain, bound, b_addrs, bsize, ke, d_type) type(domain2D), intent(in) :: domain type(overlapSpec), intent(in) :: bound - integer(LONG_KIND), intent(in) :: f_addrs(:,:) - integer(LONG_KIND), intent(in) :: b_addrs(:,:,:) + integer(i8_kind), intent(in) :: f_addrs(:,:) + integer(i8_kind), intent(in) :: b_addrs(:,:,:) integer, intent(in) :: bsize(:), ke MPP_TYPE_, intent(in) :: d_type ! creates unique interface @@ -332,8 +332,8 @@ subroutine MPP_DO_GET_BOUNDARY_AD_3D_V_(f_addrsx, f_addrsy, domain, boundx, boun bsizex, bsizey, ke, d_type, flags, gridtype) type(domain2D), intent(in) :: domain type(overlapSpec), intent(in) :: boundx, boundy - integer(LONG_KIND), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) - integer(LONG_KIND), intent(in) :: b_addrsx(:,:,:), b_addrsy(:,:,:) + integer(i8_kind), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) + integer(i8_kind), intent(in) :: b_addrsx(:,:,:), b_addrsy(:,:,:) integer, intent(in) :: bsizex(:), bsizey(:), ke MPP_TYPE_, intent(in) :: d_type ! creates unique interface integer, intent(in) :: flags diff --git a/mpp/include/mpp_do_redistribute.h b/mpp/include/mpp_do_redistribute.h index 9d576fbcdc..d238828e2d 100644 --- a/mpp/include/mpp_do_redistribute.h +++ b/mpp/include/mpp_do_redistribute.h @@ -17,7 +17,7 @@ !* License along with FMS. If not, see . 
!*********************************************************************** subroutine MPP_DO_REDISTRIBUTE_3D_( f_in, f_out, d_comm, d_type ) - integer(LONG_KIND), intent(in) :: f_in(:), f_out(:) + integer(i8_kind), intent(in) :: f_in(:), f_out(:) type(DomainCommunicator2D), intent(in) :: d_comm MPP_TYPE_, intent(in) :: d_type MPP_TYPE_ :: field_in(d_comm%domain_in%x(1)%data%begin:d_comm%domain_in%x(1)%data%end, & diff --git a/mpp/include/mpp_do_update.h b/mpp/include/mpp_do_update.h index cf0d1ca4d5..76d319b07e 100644 --- a/mpp/include/mpp_do_update.h +++ b/mpp/include/mpp_do_update.h @@ -19,7 +19,7 @@ !*********************************************************************** subroutine MPP_DO_UPDATE_3D_( f_addrs, domain, update, d_type, ke, flags) !updates data domain of 3D field whose computational domains have been computed - integer(LONG_KIND), intent(in) :: f_addrs(:,:) + integer(i8_kind), intent(in) :: f_addrs(:,:) type(domain2D), intent(in) :: domain type(overlapSpec), intent(in) :: update MPP_TYPE_, intent(in) :: d_type ! 
creates unique interface diff --git a/mpp/include/mpp_do_updateV.h b/mpp/include/mpp_do_updateV.h index 970a484050..9c84cdd0d4 100644 --- a/mpp/include/mpp_do_updateV.h +++ b/mpp/include/mpp_do_updateV.h @@ -20,7 +20,7 @@ subroutine MPP_DO_UPDATE_3D_V_(f_addrsx,f_addrsy, domain, update_x, update_y, & d_type, ke, gridtype, flags) !updates data domain of 3D field whose computational domains have been computed - integer(LONG_KIND), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) + integer(i8_kind), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) type(domain2d), intent(in) :: domain type(overlapSpec), intent(in) :: update_x, update_y integer, intent(in) :: ke diff --git a/mpp/include/mpp_do_updateV_ad.h b/mpp/include/mpp_do_updateV_ad.h index cbab7d6171..8f8bc476ec 100644 --- a/mpp/include/mpp_do_updateV_ad.h +++ b/mpp/include/mpp_do_updateV_ad.h @@ -23,7 +23,7 @@ subroutine MPP_DO_UPDATE_AD_3D_V_(f_addrsx,f_addrsy, domain, update_x, update_y, & d_type, ke, gridtype, flags) !updates data domain of 3D field whose computational domains have been computed - integer(LONG_KIND), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) + integer(i8_kind), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) type(domain2d), intent(in) :: domain type(overlapSpec), intent(in) :: update_x, update_y integer, intent(in) :: ke diff --git a/mpp/include/mpp_do_updateV_nonblock.h b/mpp/include/mpp_do_updateV_nonblock.h index d8249b758f..a1c0ebf8e8 100644 --- a/mpp/include/mpp_do_updateV_nonblock.h +++ b/mpp/include/mpp_do_updateV_nonblock.h @@ -20,7 +20,7 @@ subroutine MPP_START_DO_UPDATE_3D_V_(id_update, f_addrsx, f_addrsy, domain, update_x, update_y, & d_type, ke_max, ke_list, gridtype, flags, reuse_id_update, name) integer, intent(in) :: id_update - integer(LONG_KIND), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) + integer(i8_kind), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) type(domain2d), intent(in) :: domain type(overlapSpec), intent(in) :: update_x, update_y integer, intent(in) :: ke_max @@ -537,7 +537,7 @@ end 
subroutine MPP_START_DO_UPDATE_3D_V_ subroutine MPP_COMPLETE_DO_UPDATE_3D_V_(id_update, f_addrsx, f_addrsy, domain, update_x, update_y, & d_type, ke_max, ke_list, gridtype, flags) integer, intent(in) :: id_update - integer(LONG_KIND), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) + integer(i8_kind), intent(in) :: f_addrsx(:,:), f_addrsy(:,:) type(domain2d), intent(in) :: domain type(overlapSpec), intent(in) :: update_x, update_y integer, intent(in) :: ke_max diff --git a/mpp/include/mpp_do_update_ad.h b/mpp/include/mpp_do_update_ad.h index d11c6769db..17b01cdd5a 100644 --- a/mpp/include/mpp_do_update_ad.h +++ b/mpp/include/mpp_do_update_ad.h @@ -22,7 +22,7 @@ subroutine MPP_DO_UPDATE_AD_3D_( f_addrs, domain, update, d_type, ke, flags) !updates data domain of 3D field whose computational domains have been computed - integer(LONG_KIND), intent(in) :: f_addrs(:,:) + integer(i8_kind), intent(in) :: f_addrs(:,:) type(domain2D), intent(in) :: domain type(overlapSpec), intent(in) :: update MPP_TYPE_, intent(in) :: d_type ! creates unique interface diff --git a/mpp/include/mpp_do_update_nest.h b/mpp/include/mpp_do_update_nest.h index fcbb36eb9b..48f5e6d333 100644 --- a/mpp/include/mpp_do_update_nest.h +++ b/mpp/include/mpp_do_update_nest.h @@ -19,15 +19,15 @@ !*********************************************************************** subroutine MPP_DO_UPDATE_NEST_FINE_3D_(f_addrs, nest_domain, update, d_type, ke, wb_addrs, eb_addrs, & sb_addrs, nb_addrs, flags, xbegin, xend, ybegin, yend) - integer(LONG_KIND), intent(in) :: f_addrs(:) + integer(i8_kind), intent(in) :: f_addrs(:) type(nest_level_type), intent(in) :: nest_domain type(nestSpec), intent(in) :: update MPP_TYPE_, intent(in) :: d_type ! 
creates unique interface integer, intent(in) :: ke - integer(LONG_KIND), intent(in) :: wb_addrs(:) - integer(LONG_KIND), intent(in) :: eb_addrs(:) - integer(LONG_KIND), intent(in) :: sb_addrs(:) - integer(LONG_KIND), intent(in) :: nb_addrs(:) + integer(i8_kind), intent(in) :: wb_addrs(:) + integer(i8_kind), intent(in) :: eb_addrs(:) + integer(i8_kind), intent(in) :: sb_addrs(:) + integer(i8_kind), intent(in) :: nb_addrs(:) integer, intent(in) :: flags integer, intent(in) :: xbegin, xend, ybegin, yend @@ -260,15 +260,15 @@ end subroutine MPP_DO_UPDATE_NEST_FINE_3D_ #ifdef VECTOR_FIELD_ subroutine MPP_DO_UPDATE_NEST_FINE_3D_V_(f_addrsx, f_addrsy, nest_domain, update_x, update_y, d_type, ke, wb_addrsx, wb_addrsy, & eb_addrsx, eb_addrsy, sb_addrsx, sb_addrsy, nb_addrsx, nb_addrsy, flags) - integer(LONG_KIND), intent(in) :: f_addrsx(:), f_addrsy(:) + integer(i8_kind), intent(in) :: f_addrsx(:), f_addrsy(:) type(nest_level_type), intent(in) :: nest_domain type(nestSpec), intent(in) :: update_x, update_y MPP_TYPE_, intent(in) :: d_type ! 
creates unique interface integer, intent(in) :: ke - integer(LONG_KIND), intent(in) :: wb_addrsx(:), wb_addrsy(:) - integer(LONG_KIND), intent(in) :: eb_addrsx(:), eb_addrsy(:) - integer(LONG_KIND), intent(in) :: sb_addrsx(:), sb_addrsy(:) - integer(LONG_KIND), intent(in) :: nb_addrsx(:), nb_addrsy(:) + integer(i8_kind), intent(in) :: wb_addrsx(:), wb_addrsy(:) + integer(i8_kind), intent(in) :: eb_addrsx(:), eb_addrsy(:) + integer(i8_kind), intent(in) :: sb_addrsx(:), sb_addrsy(:) + integer(i8_kind), intent(in) :: nb_addrsx(:), nb_addrsy(:) integer, intent(in) :: flags character(len=8) :: text @@ -677,8 +677,8 @@ end subroutine MPP_DO_UPDATE_NEST_FINE_3D_V_ !############################################################################### subroutine MPP_DO_UPDATE_NEST_COARSE_3D_(f_addrs_in, f_addrs_out, nest_domain, update, d_type, ke) - integer(LONG_KIND), intent(in) :: f_addrs_in(:) - integer(LONG_KIND), intent(in) :: f_addrs_out(:) + integer(i8_kind), intent(in) :: f_addrs_in(:) + integer(i8_kind), intent(in) :: f_addrs_out(:) type(nest_domain_type), intent(in) :: nest_domain type(nestSpec), intent(in) :: update MPP_TYPE_, intent(in) :: d_type ! 
creates unique interface @@ -854,8 +854,8 @@ end subroutine MPP_DO_UPDATE_NEST_COARSE_3D_ !############################################################################### subroutine MPP_DO_UPDATE_NEST_COARSE_3D_V_(f_addrsx_in, f_addrsy_in, f_addrsx_out, f_addrsy_out, & nest_domain, nest, update_x, update_y, d_type, ke, flags) - integer(LONG_KIND), intent(in) :: f_addrsx_in(:), f_addrsy_in(:) - integer(LONG_KIND), intent(in) :: f_addrsx_out(:), f_addrsy_out(:) + integer(i8_kind), intent(in) :: f_addrsx_in(:), f_addrsy_in(:) + integer(i8_kind), intent(in) :: f_addrsx_out(:), f_addrsy_out(:) type(nest_domain_type), intent(in) :: nest_domain type(nest_level_type), intent(in) :: nest type(nestSpec), intent(in) :: update_x, update_y diff --git a/mpp/include/mpp_do_update_nonblock.h b/mpp/include/mpp_do_update_nonblock.h index 1da02643e1..4006aa3383 100644 --- a/mpp/include/mpp_do_update_nonblock.h +++ b/mpp/include/mpp_do_update_nonblock.h @@ -19,7 +19,7 @@ !*********************************************************************** subroutine MPP_START_DO_UPDATE_3D_(id_update, f_addrs, domain, update, d_type, ke_max, ke_list, flags, reuse_id_update, name) integer, intent(in) :: id_update - integer(LONG_KIND), intent(in) :: f_addrs(:,:) + integer(i8_kind), intent(in) :: f_addrs(:,:) type(domain2D), intent(in) :: domain type(overlapSpec), intent(in) :: update MPP_TYPE_, intent(in) :: d_type ! 
creates unique interface @@ -256,7 +256,7 @@ end subroutine MPP_START_DO_UPDATE_3D_ subroutine MPP_COMPLETE_DO_UPDATE_3D_(id_update, f_addrs, domain, update, d_type, ke_max, ke_list, flags) integer, intent(in) :: id_update - integer(LONG_KIND), intent(in) :: f_addrs(:,:) + integer(i8_kind), intent(in) :: f_addrs(:,:) type(domain2d), intent(in) :: domain type(overlapSpec), intent(in) :: update integer, intent(in) :: ke_max diff --git a/mpp/include/mpp_domains_comm.inc b/mpp/include/mpp_domains_comm.inc index 9be3862757..5bb258242b 100644 --- a/mpp/include/mpp_domains_comm.inc +++ b/mpp/include/mpp_domains_comm.inc @@ -24,9 +24,9 @@ isize_in,jsize_in,ksize_in,isize_out,jsize_out,ksize_out) RESULT(d_comm) type(DomainCommunicator2D), pointer :: d_comm type(domain2D),target, intent(in) :: domain_in - integer(LONG_KIND), intent(in) :: l_addrs_in(:) + integer(i8_kind), intent(in) :: l_addrs_in(:) type(domain2D),target, intent(in) :: domain_out - integer(LONG_KIND), intent(in) :: l_addrs_out(:) + integer(i8_kind), intent(in) :: l_addrs_out(:) integer, intent(in) :: isize_in integer, intent(in) :: jsize_in integer, intent(in) :: ksize_in @@ -34,13 +34,13 @@ integer, intent(in) :: jsize_out integer, intent(in) :: ksize_out - integer(LONG_KIND) :: domain_id + integer(i8_kind) :: domain_id integer :: m, list integer :: is, ie, js, je, ke, ioff, joff, list_size integer :: isc, iec, jsc, jec, mytile integer :: lsize,rsize,msgsize,to_pe,from_pe integer, allocatable,dimension(:) :: isL, jsL - integer(LONG_KIND),allocatable,dimension(:,:) :: slist_addr + integer(i8_kind),allocatable,dimension(:,:) :: slist_addr character(len=8) :: text @@ -204,25 +204,25 @@ jsize_l, ksize,l_addr2,flags, position) RESULT(d_comm) type(DomainCommunicator2D), pointer :: d_comm type(domain2D),target, intent(in) :: domain - integer(LONG_KIND), intent(in) :: l_addr + integer(i8_kind), intent(in) :: l_addr integer, intent(in) :: isize_g integer, intent(in) :: jsize_g integer, intent(in) :: isize_l integer, 
intent(in) :: jsize_l integer, intent(in) :: ksize - integer(LONG_KIND),optional,intent(in) :: l_addr2 + integer(i8_kind),optional,intent(in) :: l_addr2 integer, optional, intent(in) :: flags integer, optional, intent(in) :: position - integer(LONG_KIND) :: domain_id + integer(i8_kind) :: domain_id integer :: n, lpos, rpos, list, nlist, tile_id integer :: update_flags logical :: xonly, yonly integer :: is, ie, js, je, ioff, joff, ishift, jshift integer :: lsize,msgsize,from_pe integer, allocatable,dimension(:) :: isL, jsL - integer(LONG_KIND),allocatable,dimension(:,:) :: slist_addr - integer(LONG_KIND),save ,dimension(2) :: rem_addr + integer(i8_kind),allocatable,dimension(:,:) :: slist_addr + integer(i8_kind),save ,dimension(2) :: rem_addr character(len=8) :: text if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' ) @@ -427,12 +427,12 @@ ! Since initialization of the d_comm type is expensive, freeing should be a rare ! event. Thus no attempt is made to salvage freed d_comm's. type(domain2D), intent(in) :: domain_in - integer(LONG_KIND), intent(in) :: l_addr + integer(i8_kind), intent(in) :: l_addr type(domain2D), intent(in) :: domain_out - integer(LONG_KIND), intent(in) :: l_addr2 + integer(i8_kind), intent(in) :: l_addr2 integer, intent(in) :: ksize,lsize - integer(LONG_KIND) :: domain_id + integer(i8_kind) :: domain_id if(l_addr2 > 0)then domain_id = set_domain_id(domain_out%id,ksize+lsize) @@ -447,13 +447,13 @@ ! Since initialization of the d_comm type is expensive, freeing should be a rare ! event. Thus no attempt is made to salvage freed d_comm's. 
type(domain2D), intent(in) :: domain - integer(LONG_KIND), intent(in) :: l_addr + integer(i8_kind), intent(in) :: l_addr integer, intent(in) :: ksize - integer(LONG_KIND),optional,intent(in) :: l_addr2 + integer(i8_kind),optional,intent(in) :: l_addr2 integer, optional,intent(in) :: flags integer :: update_flags - integer(LONG_KIND) :: domain_id + integer(i8_kind) :: domain_id update_flags=0; if(PRESENT(flags))update_flags=flags domain_id=set_domain_id(domain%id,ksize,update_flags) @@ -464,23 +464,23 @@ subroutine free_comm(domain_id,l_addr,l_addr2) ! Since initialization of the d_comm type is expensive, freeing should be a rare ! event. Thus no attempt is made to salvage freed d_comm's. - integer(LONG_KIND), intent(in) :: domain_id - integer(LONG_KIND), intent(in) :: l_addr - integer(LONG_KIND),optional,intent(in) :: l_addr2 + integer(i8_kind), intent(in) :: domain_id + integer(i8_kind), intent(in) :: l_addr + integer(i8_kind),optional,intent(in) :: l_addr2 - integer(LONG_KIND) :: dc_key,a_key + integer(i8_kind) :: dc_key,a_key integer :: dc_idx,a_idx,i_idx,insert,insert_a,insert_i integer :: a2_idx,insert_a2 i_idx = find_key(domain_id,ids_sorted(1:n_ids),insert_i) a_idx = find_key(l_addr,addrs_sorted(1:a_sort_len),insert_a) - a_key = int(addrs_idx(a_idx),KIND(LONG_KIND)) + a_key = int(addrs_idx(a_idx),KIND(i8_kind)) if(PRESENT(l_addr2))then a2_idx = find_key(l_addr2,addrs2_sorted(1:a2_sort_len),insert_a2) - a_key = a_key + ADDR2_BASE*int(addrs2_idx(a2_idx),KIND(LONG_KIND)) + a_key = a_key + ADDR2_BASE*int(addrs2_idx(a2_idx),KIND(i8_kind)) endif - dc_key = DOMAIN_ID_BASE*int(ids_idx(i_idx),KIND(LONG_KIND)) + a_key + dc_key = DOMAIN_ID_BASE*int(ids_idx(i_idx),KIND(i8_kind)) + a_key dc_idx = find_key(dc_key,dcKey_sorted(1:dc_sort_len),insert) if(dc_idx < 0)then @@ -494,24 +494,24 @@ function get_comm(domain_id,l_addr,l_addr2) - integer(LONG_KIND),intent(in) :: domain_id - integer(LONG_KIND),intent(in) :: l_addr - integer(LONG_KIND),intent(in),optional :: l_addr2 + 
integer(i8_kind),intent(in) :: domain_id + integer(i8_kind),intent(in) :: l_addr + integer(i8_kind),intent(in),optional :: l_addr2 type(DomainCommunicator2D), pointer :: get_comm - integer(LONG_KIND) :: dc_key,a_key + integer(i8_kind) :: dc_key,a_key integer :: i,dc_idx,a_idx,i_idx,insert,insert_a,insert_i integer :: a2_idx,insert_a2 if(.not.ALLOCATED(d_comm))ALLOCATE(d_comm(MAX_FIELDS)) i_idx = find_key(domain_id,ids_sorted(1:n_ids),insert_i) a_idx = find_key(l_addr,addrs_sorted(1:a_sort_len),insert_a) - a_key = int(addrs_idx(a_idx),KIND(LONG_KIND)) + a_key = int(addrs_idx(a_idx),KIND(i8_kind)) if(PRESENT(l_addr2))then a2_idx = find_key(l_addr2,addrs2_sorted(1:a2_sort_len),insert_a2) - a_key = a_key + ADDR2_BASE*int(addrs2_idx(a2_idx),KIND(LONG_KIND)) + a_key = a_key + ADDR2_BASE*int(addrs2_idx(a2_idx),KIND(i8_kind)) endif - dc_key = DOMAIN_ID_BASE*int(ids_idx(i_idx),KIND(LONG_KIND)) + a_key + dc_key = DOMAIN_ID_BASE*int(ids_idx(i_idx),KIND(i8_kind)) + a_key dc_idx = find_key(dc_key,dcKey_sorted(1:dc_sort_len),insert) if(dc_idx > 0)then get_comm =>d_comm(d_comm_idx(dc_idx)) @@ -544,7 +544,7 @@ endif a_key = int(addrs_idx(a_idx),KIND(8)) if(PRESENT(l_addr2))a_key = a_key + ADDR2_BASE*int(addrs2_idx(a2_idx),KIND(8)) - dc_key = DOMAIN_ID_BASE*int(ids_idx(i_idx),KIND(LONG_KIND)) + a_key + dc_key = DOMAIN_ID_BASE*int(ids_idx(i_idx),KIND(i8_kind)) + a_key dc_idx = find_key(dc_key,dcKey_sorted(1:dc_sort_len),insert) if(dc_idx /= -1)call mpp_error(FATAL,'GET_COMM: attempt to insert existing key') n_comm = n_comm + 1 @@ -562,11 +562,11 @@ function push_key(sorted,idx,n_idx,insert,key,ival) - integer(LONG_KIND),intent(inout),dimension(:) :: sorted + integer(i8_kind),intent(inout),dimension(:) :: sorted integer, intent(inout),dimension(-1:) :: idx ! 
Start -1 to simplify first call logic in get_comm integer, intent(inout) :: n_idx integer, intent(in) :: insert - integer(LONG_KIND),intent(in) :: key + integer(i8_kind),intent(in) :: key integer, intent(in) :: ival integer :: push_key,i @@ -583,7 +583,7 @@ subroutine pop_key(sorted,idx,n_idx,key_idx) - integer(LONG_KIND),intent(inout),dimension(:) :: sorted + integer(i8_kind),intent(inout),dimension(:) :: sorted integer, intent(inout),dimension(-1:) :: idx ! Start -1 to simplify first call logic in get_comm integer, intent(inout) :: n_idx integer, intent(in) :: key_idx @@ -602,8 +602,8 @@ function find_key(key,sorted,insert) RESULT(n) ! The algorithm used here requires monotonic keys w/out repetition. - integer(LONG_KIND),intent(in) :: key ! new address to be found in list - integer(LONG_KIND),dimension(:),intent(in) :: sorted ! list of sorted local addrs + integer(i8_kind),intent(in) :: key ! new address to be found in list + integer(i8_kind),dimension(:),intent(in) :: sorted ! list of sorted local addrs integer, intent(out) :: insert integer :: n, n_max, n_min, n_key logical :: not_found @@ -696,18 +696,18 @@ function set_domain_id(d_id,ksize,flags,gtype, position, whalo, ehalo, shalo, nhalo) - integer(LONG_KIND), intent(in) :: d_id + integer(i8_kind), intent(in) :: d_id integer , intent(in) :: ksize integer , optional, intent(in) :: flags integer , optional, intent(in) :: gtype integer , optional, intent(in) :: position integer , optional, intent(in) :: whalo, ehalo, shalo, nhalo - integer(LONG_KIND) :: set_domain_id + integer(i8_kind) :: set_domain_id set_domain_id=d_id + KE_BASE*int(ksize,KIND(d_id)) if(PRESENT(flags))set_domain_id=set_domain_id+int(flags,KIND(d_id)) - if(PRESENT(gtype))set_domain_id=set_domain_id+GT_BASE*int(gtype,KIND(d_id)) ! Must be LONG_KIND arithmetic + if(PRESENT(gtype))set_domain_id=set_domain_id+GT_BASE*int(gtype,KIND(d_id)) ! Must be i8_kind arithmetic !--- gtype is never been used to set id. 
we need to add position to calculate id to seperate !--- BGRID and CGRID or scalar variable. if(present(position)) set_domain_id=set_domain_id+GT_BASE*int(2**position, KIND(d_id)) diff --git a/mpp/include/mpp_domains_define.inc b/mpp/include/mpp_domains_define.inc index 2d4d7035ea..16bdeb534e 100644 --- a/mpp/include/mpp_domains_define.inc +++ b/mpp/include/mpp_domains_define.inc @@ -26,8 +26,8 @@ ! ! subroutine mpp_define_layout2D( global_indices, ndivs, layout ) - integer, intent(in) :: global_indices(:) !(/ isg, ieg, jsg, jeg /) - integer, intent(in) :: ndivs !number of divisions to divide global domain + integer, intent(in) :: global_indices(:) !< (/ isg, ieg, jsg, jeg /); Defines the global domain. + integer, intent(in) :: ndivs !< number of divisions to divide global domain integer, intent(out) :: layout(:) integer :: isg, ieg, jsg, jeg, isz, jsz, idiv, jdiv @@ -276,43 +276,31 @@ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! - ! MPP_DEFINE_DOMAINS: define layout and decomposition ! + !>@brief MPP_DEFINE_DOMAINS: define layout and decomposition ! ! ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - ! > - ! - ! - ! - ! - ! - ! - ! - ! - ! - !routine to divide global array indices among domains, and assign domains to PEs - !domain is of type domain1D - !ARGUMENTS: - ! global_indices(2)=(isg,ieg) gives the extent of global domain - ! ndivs is number of divisions of domain: even divisions unless extent is present. - ! domain is the returned domain1D - ! pelist (optional) list of PEs to which domains are to be assigned (default 0...npes-1) - ! size of pelist must correspond to number of mask=.TRUE. divisions - ! flags define whether compute and data domains are global (undecomposed) and whether global domain has periodic boundaries - ! halo (optional) defines halo width (currently the same on both sides) - ! 
extent (optional) array defines width of each division (used for non-uniform domain decomp, for e.g load-balancing) - ! maskmap (optional) a division whose maskmap=.FALSE. is not assigned to any domain - ! By default we assume decomposition of compute and data domains, non-periodic boundaries, no halo, as close to uniform extents - ! as the input parameters permit + !> @detailed routine to divide global array indices among domains, and assign domains to PEs subroutine mpp_define_domains1D( global_indices, ndivs, domain, pelist, flags, halo, extent, maskmap, & memory_size, begin_halo, end_halo ) - integer, intent(in) :: global_indices(:) !(/ isg, ieg /) - integer, intent(in) :: ndivs - type(domain1D), intent(inout) :: domain !declared inout so that existing links, if any, can be nullified - integer, intent(in), optional :: pelist(0:) - integer, intent(in), optional :: flags, halo - integer, intent(in), optional :: extent(0:) - logical, intent(in), optional :: maskmap(0:) + integer, intent(in) :: global_indices(:) !< (/ isg, ieg /) gives the extent of global domain + integer, intent(in) :: ndivs !< number of divisions of domain: even divisions unless extent is present. + type(domain1D), intent(inout) :: domain !< the returned domain1D; declared inout so that + !! existing links, if any, can be nullified + integer, intent(in), optional :: pelist(0:) !< list of PEs to which domains are to be assigned + !! (default 0...npes-1); size of pelist must + !! correspond to number of mask=.TRUE. divisions + integer, intent(in), optional :: flags, halo !< flags define whether compute and data domains + !! are global (undecomposed) and whether the global + !! domain has periodic boundaries. + !! halo defines halo width (currently the same on both sides) + integer, intent(in), optional :: extent(0:) !< array extent; defines width of each division + !! 
(used for non-uniform domain decomp, for e.g load-balancing) + logical, intent(in), optional :: maskmap(0:) !< a division whose maskmap=.FALSE. is not + !! assigned to any domain. By default we assume + !! decomposition of compute and data domains, non-periodic boundaries, + !! no halo, as close to uniform extents as the + !! input parameters permit integer, intent(in), optional :: memory_size integer, intent(in), optional :: begin_halo, end_halo @@ -874,8 +862,8 @@ domain%pe = mpp_pe() domain%pos = pos - domain_cnt = domain_cnt + INT(1,KIND=LONG_KIND) - domain%id = domain_cnt*DOMAIN_ID_BASE ! Must be LONG_KIND arithmetic + domain_cnt = domain_cnt + INT(1,KIND=i8_kind) + domain%id = domain_cnt*DOMAIN_ID_BASE ! Must be i8_kind arithmetic !do domain decomposition using 1D versions in X and Y, call mpp_define_domains( global_indices(1:2), ndivx, domain%x(tile), & @@ -7449,11 +7437,13 @@ end subroutine check_alignment ! subroutine mpp_modify_domain1D(domain_in,domain_out,cbegin,cend,gbegin,gend, hbegin, hend) ! -type(domain1D), intent(in) :: domain_in -type(domain1D), intent(inout) :: domain_out -integer, intent(in), optional :: hbegin, hend ! halo size -integer, intent(in), optional :: cbegin, cend ! extent of compute_domain -integer, intent(in), optional :: gbegin, gend ! extent of global domain +type(domain1D), intent(in) :: domain_in !< The source domain. +type(domain1D), intent(inout) :: domain_out !< The returned domain. +integer, intent(in), optional :: hbegin, hend !< halo size +integer, intent(in), optional :: cbegin, cend !< Axis specifications associated with the compute + !! domain of the returned 1D domain. +integer, intent(in), optional :: gbegin, gend !< Axis specifications associated with the global + !! domain of the returned 1D domain. integer :: ndivs, global_indices(2) !(/ isg, ieg /) integer :: flag ! get the global indices of the input domain @@ -7495,11 +7485,13 @@ end subroutine mpp_modify_domain1D ! 
subroutine mpp_modify_domain2D(domain_in, domain_out, isc, iec, jsc, jec, isg, ieg, jsg, jeg, whalo, ehalo, shalo, nhalo) ! -type(domain2D), intent(in) :: domain_in -type(domain2D), intent(inout) :: domain_out -integer, intent(in), optional :: isc, iec, jsc, jec -integer, intent(in), optional :: isg, ieg, jsg, jeg -integer, intent(in), optional :: whalo, ehalo, shalo, nhalo +type(domain2D), intent(in) :: domain_in !< The source domain. +type(domain2D), intent(inout) :: domain_out !< The returned domain. +integer, intent(in), optional :: isc, iec, jsc, jec !< Zonal and meridional axis specifications + !! associated with the global domain of the returned 2D domain. +integer, intent(in), optional :: isg, ieg, jsg, jeg !< Zonal axis specifications associated with + !! the global domain of the returned 2D domain. +integer, intent(in), optional :: whalo, ehalo, shalo, nhalo !< halo size in x- and y- directions integer :: global_indices(4), layout(2) integer :: xflag, yflag, nlist, i diff --git a/mpp/include/mpp_domains_misc.inc b/mpp/include/mpp_domains_misc.inc index 96b41afe60..69899a3b29 100644 --- a/mpp/include/mpp_domains_misc.inc +++ b/mpp/include/mpp_domains_misc.inc @@ -968,7 +968,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_UPDATE_DOMAINS_2D_ #define MPP_UPDATE_DOMAINS_2D_ mpp_update_domain2D_r8_2D #undef MPP_UPDATE_DOMAINS_3D_ @@ -1000,7 +1000,7 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_UPDATE_DOMAINS_2D_ #define MPP_UPDATE_DOMAINS_2D_ mpp_update_domain2D_c8_2D #undef MPP_UPDATE_DOMAINS_3D_ @@ -1020,9 +1020,8 @@ end subroutine init_nonblock_type #include #endif -#ifndef no_8byte_integers #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_UPDATE_DOMAINS_2D_ 
#define MPP_UPDATE_DOMAINS_2D_ mpp_update_domain2D_i8_2D #undef MPP_UPDATE_DOMAINS_3D_ @@ -1040,13 +1039,11 @@ end subroutine init_nonblock_type #undef MPP_REDISTRIBUTE_5D_ #define MPP_REDISTRIBUTE_5D_ mpp_redistribute_i8_5D #include -#endif -#ifdef OVERLOAD_R4 #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_UPDATE_DOMAINS_2D_ #define MPP_UPDATE_DOMAINS_2D_ mpp_update_domain2D_r4_2D #undef MPP_UPDATE_DOMAINS_3D_ @@ -1064,7 +1061,6 @@ end subroutine init_nonblock_type #define MPP_UPDATE_DOMAINS_4D_V_ mpp_update_domain2D_r4_4Dv #undef MPP_UPDATE_DOMAINS_5D_V_ #define MPP_UPDATE_DOMAINS_5D_V_ mpp_update_domain2D_r4_5Dv -#endif #undef MPP_REDISTRIBUTE_2D_ #define MPP_REDISTRIBUTE_2D_ mpp_redistribute_r4_2D #undef MPP_REDISTRIBUTE_3D_ @@ -1079,7 +1075,7 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C4 #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_UPDATE_DOMAINS_2D_ #define MPP_UPDATE_DOMAINS_2D_ mpp_update_domain2D_c4_2D #undef MPP_UPDATE_DOMAINS_3D_ @@ -1100,7 +1096,7 @@ end subroutine init_nonblock_type #endif #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_UPDATE_DOMAINS_2D_ #define MPP_UPDATE_DOMAINS_2D_ mpp_update_domain2D_i4_2D #undef MPP_UPDATE_DOMAINS_3D_ @@ -1130,7 +1126,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_START_UPDATE_DOMAINS_2D_ #define MPP_START_UPDATE_DOMAINS_2D_ mpp_start_update_domain2D_r8_2D #undef MPP_START_UPDATE_DOMAINS_3D_ @@ -1170,7 +1166,7 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_START_UPDATE_DOMAINS_2D_ #define MPP_START_UPDATE_DOMAINS_2D_ 
mpp_start_update_domain2D_c8_2D #undef MPP_START_UPDATE_DOMAINS_3D_ @@ -1190,10 +1186,9 @@ end subroutine init_nonblock_type #include #endif -#ifndef no_8byte_integers #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_START_UPDATE_DOMAINS_2D_ #define MPP_START_UPDATE_DOMAINS_2D_ mpp_start_update_domain2D_i8_2D #undef MPP_START_UPDATE_DOMAINS_3D_ @@ -1211,13 +1206,11 @@ end subroutine init_nonblock_type #undef MPP_COMPLETE_UPDATE_DOMAINS_5D_ #define MPP_COMPLETE_UPDATE_DOMAINS_5D_ mpp_complete_update_domain2D_i8_5D #include -#endif -#ifdef OVERLOAD_R4 #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_START_UPDATE_DOMAINS_2D_ #define MPP_START_UPDATE_DOMAINS_2D_ mpp_start_update_domain2D_r4_2D #undef MPP_START_UPDATE_DOMAINS_3D_ @@ -1253,12 +1246,11 @@ end subroutine init_nonblock_type #define MPP_COMPLETE_UPDATE_DOMAINS_5D_V_ mpp_complete_update_domain2D_r4_5Dv #endif #include -#endif #ifdef OVERLOAD_C4 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_START_UPDATE_DOMAINS_2D_ #define MPP_START_UPDATE_DOMAINS_2D_ mpp_start_update_domain2D_c4_2D #undef MPP_START_UPDATE_DOMAINS_3D_ @@ -1280,7 +1272,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_START_UPDATE_DOMAINS_2D_ #define MPP_START_UPDATE_DOMAINS_2D_ mpp_start_update_domain2D_i4_2D #undef MPP_START_UPDATE_DOMAINS_3D_ @@ -1307,7 +1299,7 @@ end subroutine init_nonblock_type ! ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
#undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL8 #undef MPP_START_DO_UPDATE_3D_ @@ -1323,7 +1315,7 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_DOUBLE_COMPLEX #undef MPP_START_DO_UPDATE_3D_ @@ -1333,9 +1325,8 @@ end subroutine init_nonblock_type #include #endif -#ifndef no_8byte_integers #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER8 #undef MPP_START_DO_UPDATE_3D_ @@ -1343,11 +1334,9 @@ end subroutine init_nonblock_type #undef MPP_COMPLETE_DO_UPDATE_3D_ #define MPP_COMPLETE_DO_UPDATE_3D_ mpp_complete_do_update_i8_3D #include -#endif -#ifdef OVERLOAD_R4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL4 #undef MPP_START_DO_UPDATE_3D_ @@ -1360,11 +1349,10 @@ end subroutine init_nonblock_type #define MPP_COMPLETE_DO_UPDATE_3D_V_ mpp_complete_do_update_r4_3Dv #include #include -#endif #ifdef OVERLOAD_C4 #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_COMPLEX #undef MPP_START_DO_UPDATE_3D_ @@ -1375,7 +1363,7 @@ end subroutine init_nonblock_type #endif #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_INTEGER4 #undef MPP_START_DO_UPDATE_3D_ @@ -1388,7 +1376,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_DO_UPDATE_3D_ #define MPP_DO_UPDATE_3D_ mpp_do_update_r8_3d #ifdef VECTOR_FIELD_ @@ -1401,26 +1389,23 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef VECTOR_FIELD_ #undef 
MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_DO_UPDATE_3D_ #define MPP_DO_UPDATE_3D_ mpp_do_update_c8_3d #include #define VECTOR_FIELD_ #endif -#ifndef no_8byte_integers #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_DO_UPDATE_3D_ #define MPP_DO_UPDATE_3D_ mpp_do_update_i8_3d #include -#endif -#ifdef OVERLOAD_R4 #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_DO_UPDATE_3D_ #define MPP_DO_UPDATE_3D_ mpp_do_update_r4_3d #ifdef VECTOR_FIELD_ @@ -1429,12 +1414,11 @@ end subroutine init_nonblock_type #endif #include #include -#endif #ifdef OVERLOAD_C4 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_DO_UPDATE_3D_ #define MPP_DO_UPDATE_3D_ mpp_do_update_c4_3d #include @@ -1442,14 +1426,14 @@ end subroutine init_nonblock_type #endif #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_DO_UPDATE_3D_ #define MPP_DO_UPDATE_3D_ mpp_do_update_i4_3d #include #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_DO_CHECK_3D_ #define MPP_DO_CHECK_3D_ mpp_do_check_r8_3d #ifdef VECTOR_FIELD_ @@ -1462,26 +1446,23 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_DO_CHECK_3D_ #define MPP_DO_CHECK_3D_ mpp_do_check_c8_3d #include #define VECTOR_FIELD_ #endif -#ifndef no_8byte_integers #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_DO_CHECK_3D_ #define MPP_DO_CHECK_3D_ mpp_do_check_i8_3d #include -#endif -#ifdef OVERLOAD_R4 #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ 
real(r4_kind) #undef MPP_DO_CHECK_3D_ #define MPP_DO_CHECK_3D_ mpp_do_check_r4_3d #ifdef VECTOR_FIELD_ @@ -1490,19 +1471,18 @@ end subroutine init_nonblock_type #endif #include #include -#endif #ifdef OVERLOAD_C4 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_DO_CHECK_3D_ #define MPP_DO_CHECK_3D_ mpp_do_check_c4_3d #include #endif #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_DO_CHECK_3D_ #define MPP_DO_CHECK_3D_ mpp_do_check_i4_3d #include @@ -1510,7 +1490,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_UPDATE_NEST_FINE_2D_ #define MPP_UPDATE_NEST_FINE_2D_ mpp_update_nest_fine_r8_2D #undef MPP_UPDATE_NEST_FINE_3D_ @@ -1540,7 +1520,7 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_UPDATE_NEST_FINE_2D_ #define MPP_UPDATE_NEST_FINE_2D_ mpp_update_nest_fine_c8_2D #undef MPP_UPDATE_NEST_FINE_3D_ @@ -1556,10 +1536,9 @@ end subroutine init_nonblock_type #include #endif -#ifndef no_8byte_integers #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_UPDATE_NEST_FINE_2D_ #define MPP_UPDATE_NEST_FINE_2D_ mpp_update_nest_fine_i8_2D #undef MPP_UPDATE_NEST_FINE_3D_ @@ -1573,13 +1552,11 @@ end subroutine init_nonblock_type #undef MPP_UPDATE_NEST_COARSE_4D_ #define MPP_UPDATE_NEST_COARSE_4D_ mpp_update_nest_coarse_i8_4D #include -#endif -#ifdef OVERLOAD_R4 #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_UPDATE_NEST_FINE_2D_ #define MPP_UPDATE_NEST_FINE_2D_ mpp_update_nest_fine_r4_2D #undef MPP_UPDATE_NEST_FINE_3D_ @@ -1605,12 
+1582,11 @@ end subroutine init_nonblock_type #undef MPP_UPDATE_NEST_COARSE_4D_V_ #define MPP_UPDATE_NEST_COARSE_4D_V_ mpp_update_nest_coarse_r4_4Dv #include -#endif #ifdef OVERLOAD_C4 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_UPDATE_NEST_FINE_2D_ #define MPP_UPDATE_NEST_FINE_2D_ mpp_update_nest_fine_c4_2D #undef MPP_UPDATE_NEST_FINE_3D_ @@ -1628,7 +1604,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_UPDATE_NEST_FINE_2D_ #define MPP_UPDATE_NEST_FINE_2D_ mpp_update_nest_fine_i4_2D #undef MPP_UPDATE_NEST_FINE_3D_ @@ -1646,7 +1622,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_DO_UPDATE_NEST_FINE_3D_ #define MPP_DO_UPDATE_NEST_FINE_3D_ mpp_do_update_nest_fine_r8_3D #undef MPP_DO_UPDATE_NEST_FINE_3D_V_ @@ -1660,7 +1636,7 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_DO_UPDATE_NEST_FINE_3D_ #define MPP_DO_UPDATE_NEST_FINE_3D_ mpp_do_update_nest_fine_c8_3D #undef MPP_DO_UPDATE_NEST_COARSE_3D_ @@ -1668,22 +1644,19 @@ end subroutine init_nonblock_type #include #endif -#ifndef no_8byte_integers #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_DO_UPDATE_NEST_FINE_3D_ #define MPP_DO_UPDATE_NEST_FINE_3D_ mpp_do_update_nest_fine_i8_3D #undef MPP_DO_UPDATE_NEST_COARSE_3D_ #define MPP_DO_UPDATE_NEST_COARSE_3D_ mpp_do_update_nest_coarse_i8_3D #include -#endif -#ifdef OVERLOAD_R4 #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_DO_UPDATE_NEST_FINE_3D_ #define 
MPP_DO_UPDATE_NEST_FINE_3D_ mpp_do_update_nest_fine_r4_3D #undef MPP_DO_UPDATE_NEST_FINE_3D_V_ @@ -1693,12 +1666,11 @@ end subroutine init_nonblock_type #undef MPP_DO_UPDATE_NEST_COARSE_3D_V_ #define MPP_DO_UPDATE_NEST_COARSE_3D_V_ mpp_do_update_nest_coarse_r4_3Dv #include -#endif #ifdef OVERLOAD_C4 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_DO_UPDATE_NEST_FINE_3D_ #define MPP_DO_UPDATE_NEST_FINE_3D_ mpp_do_update_nest_fine_c4_3D #undef MPP_DO_UPDATE_NEST_COARSE_3D_ @@ -1708,7 +1680,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_DO_UPDATE_NEST_FINE_3D_ #define MPP_DO_UPDATE_NEST_FINE_3D_ mpp_do_update_nest_fine_i4_3D #undef MPP_DO_UPDATE_NEST_COARSE_3D_ @@ -1724,7 +1696,7 @@ end subroutine init_nonblock_type #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_UPDATE_DOMAINS_AD_2D_ #define MPP_UPDATE_DOMAINS_AD_2D_ mpp_update_domains_ad_2D_r8_2D #undef MPP_UPDATE_DOMAINS_AD_3D_ @@ -1745,11 +1717,10 @@ end subroutine init_nonblock_type #endif #include -#ifdef OVERLOAD_R4 #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_UPDATE_DOMAINS_AD_2D_ #define MPP_UPDATE_DOMAINS_AD_2D_ mpp_update_domains_ad_2D_r4_2D #undef MPP_UPDATE_DOMAINS_AD_3D_ @@ -1769,13 +1740,12 @@ end subroutine init_nonblock_type #define MPP_UPDATE_DOMAINS_AD_5D_V_ mpp_update_domains_ad_2D_r4_5Dv #endif #include -#endif !******************************************************* #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_DO_UPDATE_AD_3D_ #define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_r8_3d #ifdef VECTOR_FIELD_ @@ -1788,26 +1758,23 @@ end 
subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_DO_UPDATE_AD_3D_ #define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_c8_3d #include #define VECTOR_FIELD_ #endif -#ifndef no_8byte_integers #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_DO_UPDATE_AD_3D_ #define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_i8_3d #include -#endif -#ifdef OVERLOAD_R4 #undef VECTOR_FIELD_ #define VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_DO_UPDATE_AD_3D_ #define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_r4_3d #ifdef VECTOR_FIELD_ @@ -1816,12 +1783,11 @@ end subroutine init_nonblock_type #endif #include #include -#endif #ifdef OVERLOAD_C4 #undef VECTOR_FIELD_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_DO_UPDATE_AD_3D_ #define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_c4_3d #include @@ -1829,14 +1795,14 @@ end subroutine init_nonblock_type #endif #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_DO_UPDATE_AD_3D_ #define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_i4_3d #include !!$#undef VECTOR_FIELD_ !!$#define VECTOR_FIELD_ !!$#undef MPP_TYPE_ -!!$#define MPP_TYPE_ real(DOUBLE_KIND) +!!$#define MPP_TYPE_ real(r8_kind) !!$#undef MPP_DO_UPDATE_AD_3D_ !!$#define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_r8_3d !!$#ifdef VECTOR_FIELD_ @@ -1849,25 +1815,25 @@ end subroutine init_nonblock_type !!$ !!$#ifdef OVERLOAD_C8 !!$#undef MPP_TYPE_ -!!$#define MPP_TYPE_ complex(DOUBLE_KIND) +!!$#define MPP_TYPE_ complex(c8_kind) !!$#undef MPP_DO_UPDATE_AD_3D_ !!$#define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_c8_3d !!$#include !!$#endif !!$ -!!$#ifndef no_8byte_integers +!!$ !!$#undef MPP_TYPE_ -!!$#define MPP_TYPE_ integer(LONG_KIND) +!!$#define MPP_TYPE_ integer(i8_kind) !!$#undef 
MPP_DO_UPDATE_AD_3D_ !!$#define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_i8_3d !!$#include -!!$#endif +!!$ !!$ !!$#ifdef OVERLOAD_R4 !!$#undef VECTOR_FIELD_ !!$#define VECTOR_FIELD_ !!$#undef MPP_TYPE_ -!!$#define MPP_TYPE_ real(FLOAT_KIND) +!!$#define MPP_TYPE_ real(r4_kind) !!$#undef MPP_DO_UPDATE_AD_3D_ !!$#define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_r4_3d !!$#ifdef VECTOR_FIELD_ @@ -1881,14 +1847,14 @@ end subroutine init_nonblock_type !!$#ifdef OVERLOAD_C4 !!$#undef VECTOR_FIELD_ !!$#undef MPP_TYPE_ -!!$#define MPP_TYPE_ complex(FLOAT_KIND) +!!$#define MPP_TYPE_ complex(c4_kind) !!$#undef MPP_DO_UPDATE_AD_3D_ !!$#define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_c4_3d !!$#include !!$#endif !!$ !!$#undef MPP_TYPE_ -!!$#define MPP_TYPE_ integer(INT_KIND) +!!$#define MPP_TYPE_ integer(i4_kind) !!$#undef MPP_DO_UPDATE_AD_3D_ !!$#define MPP_DO_UPDATE_AD_3D_ mpp_do_update_ad_i4_3d !!$#include @@ -1898,7 +1864,7 @@ end subroutine init_nonblock_type !******************************************************** #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_DO_REDISTRIBUTE_3D_ #define MPP_DO_REDISTRIBUTE_3D_ mpp_do_redistribute_r8_3D #include @@ -1906,57 +1872,53 @@ end subroutine init_nonblock_type #ifdef OVERLOAD_C8 #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #undef MPP_DO_REDISTRIBUTE_3D_ #define MPP_DO_REDISTRIBUTE_3D_ mpp_do_redistribute_c8_3D #include #endif -#ifndef no_8byte_integers #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef MPP_DO_REDISTRIBUTE_3D_ #define MPP_DO_REDISTRIBUTE_3D_ mpp_do_redistribute_i8_3D #include #undef MPP_TYPE_ -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #undef MPP_DO_REDISTRIBUTE_3D_ #define MPP_DO_REDISTRIBUTE_3D_ mpp_do_redistribute_l8_3D #include -#endif -#ifdef OVERLOAD_R4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef 
MPP_DO_REDISTRIBUTE_3D_ #define MPP_DO_REDISTRIBUTE_3D_ mpp_do_redistribute_r4_3D #include #undef VECTOR_FIELD_ -#endif #ifdef OVERLOAD_C4 #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #undef MPP_DO_REDISTRIBUTE_3D_ #define MPP_DO_REDISTRIBUTE_3D_ mpp_do_redistribute_c4_3D #include #endif #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef MPP_DO_REDISTRIBUTE_3D_ #define MPP_DO_REDISTRIBUTE_3D_ mpp_do_redistribute_i4_3D #include #undef MPP_TYPE_ -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #undef MPP_DO_REDISTRIBUTE_3D_ #define MPP_DO_REDISTRIBUTE_3D_ mpp_do_redistribute_l4_3D #include #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_GET_BOUNDARY_2D_ #define MPP_GET_BOUNDARY_2D_ mpp_get_boundary_r8_2d #undef MPP_GET_BOUNDARY_3D_ @@ -1976,7 +1938,7 @@ end subroutine init_nonblock_type #include #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_GET_BOUNDARY_AD_2D_ #define MPP_GET_BOUNDARY_AD_2D_ mpp_get_boundary_ad_r8_2d #undef MPP_GET_BOUNDARY_AD_3D_ @@ -1987,9 +1949,8 @@ end subroutine init_nonblock_type #define MPP_GET_BOUNDARY_AD_3D_V_ mpp_get_boundary_ad_r8_3dv #include -#ifdef OVERLOAD_R4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_GET_BOUNDARY_2D_ #define MPP_GET_BOUNDARY_2D_ mpp_get_boundary_r4_2d #undef MPP_GET_BOUNDARY_3D_ @@ -2007,11 +1968,9 @@ end subroutine init_nonblock_type !#undef MPP_GET_BOUNDARY_5D_V_ !#define MPP_GET_BOUNDARY_5D_V_ mpp_get_boundary_r4_5dv #include -#endif -#ifdef OVERLOAD_R4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_GET_BOUNDARY_AD_2D_ #define MPP_GET_BOUNDARY_AD_2D_ mpp_get_boundary_ad_r4_2d #undef MPP_GET_BOUNDARY_AD_3D_ @@ -2021,10 +1980,9 @@ end subroutine init_nonblock_type #undef MPP_GET_BOUNDARY_AD_3D_V_ 
#define MPP_GET_BOUNDARY_AD_3D_V_ mpp_get_boundary_ad_r4_3dv #include -#endif #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_DO_GET_BOUNDARY_3D_ #define MPP_DO_GET_BOUNDARY_3D_ mpp_do_get_boundary_r8_3d #undef MPP_DO_GET_BOUNDARY_3DV_ @@ -2032,35 +1990,31 @@ end subroutine init_nonblock_type #include #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPP_DO_GET_BOUNDARY_AD_3D_ #define MPP_DO_GET_BOUNDARY_AD_3D_ mpp_do_get_boundary_ad_r8_3d #undef MPP_DO_GET_BOUNDARY_AD_3DV_ #define MPP_DO_GET_BOUNDARY_AD_3D_V_ mpp_do_get_boundary_ad_r8_3dv #include -#ifdef OVERLOAD_R4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_DO_GET_BOUNDARY_3D_ #define MPP_DO_GET_BOUNDARY_3D_ mpp_do_get_boundary_r4_3d #undef MPP_DO_GET_BOUNDARY_3D_V_ #define MPP_DO_GET_BOUNDARY_3D_V_ mpp_do_get_boundary_r4_3dv #include -#endif -#ifdef OVERLOAD_R4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPP_DO_GET_BOUNDARY_AD_3D_ #define MPP_DO_GET_BOUNDARY_AD_3D_ mpp_do_get_boundary_ad_r4_3d #undef MPP_DO_GET_BOUNDARY_AD_3D_V_ #define MPP_DO_GET_BOUNDARY_AD_3D_V_ mpp_do_get_boundary_ad_r4_3dv #include -#endif #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL8 #undef MPP_CREATE_GROUP_UPDATE_2D_ @@ -2096,7 +2050,7 @@ end subroutine init_nonblock_type #include #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef MPI_TYPE_ #define MPI_TYPE_ MPI_REAL4 #undef MPP_CREATE_GROUP_UPDATE_2D_ diff --git a/mpp/include/mpp_domains_reduce.inc b/mpp/include/mpp_domains_reduce.inc index 7b87779a54..e5570e1fd5 100644 --- a/mpp/include/mpp_domains_reduce.inc +++ b/mpp/include/mpp_domains_reduce.inc @@ -34,7 +34,7 @@ #undef MPP_GLOBAL_REDUCE_5D_ #define MPP_GLOBAL_REDUCE_5D_ mpp_global_max_r8_5d #undef MPP_TYPE_ 
-#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef REDUCE_VAL_ #define REDUCE_VAL_ maxval #undef REDUCE_LOC_ @@ -52,7 +52,7 @@ #undef MPP_GLOBAL_REDUCE_5D_ #define MPP_GLOBAL_REDUCE_5D_ mpp_global_min_r8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #undef REDUCE_VAL_ #define REDUCE_VAL_ minval #undef REDUCE_LOC_ @@ -61,7 +61,6 @@ #define MPP_REDUCE_ mpp_min #include -#ifdef OVERLOAD_R4 #undef MPP_GLOBAL_REDUCE_2D_ #define MPP_GLOBAL_REDUCE_2D_ mpp_global_max_r4_2d #undef MPP_GLOBAL_REDUCE_3D_ @@ -71,7 +70,7 @@ #undef MPP_GLOBAL_REDUCE_5D_ #define MPP_GLOBAL_REDUCE_5D_ mpp_global_max_r4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef REDUCE_VAL_ #define REDUCE_VAL_ maxval #undef REDUCE_LOC_ @@ -89,7 +88,7 @@ #undef MPP_GLOBAL_REDUCE_5D_ #define MPP_GLOBAL_REDUCE_5D_ mpp_global_min_r4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef REDUCE_VAL_ #define REDUCE_VAL_ minval #undef REDUCE_LOC_ @@ -97,9 +96,7 @@ #undef MPP_REDUCE_ #define MPP_REDUCE_ mpp_min #include -#endif -#ifndef no_8byte_integers #undef MPP_GLOBAL_REDUCE_2D_ #define MPP_GLOBAL_REDUCE_2D_ mpp_global_max_i8_2d #undef MPP_GLOBAL_REDUCE_3D_ @@ -109,7 +106,7 @@ #undef MPP_GLOBAL_REDUCE_5D_ #define MPP_GLOBAL_REDUCE_5D_ mpp_global_max_i8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef REDUCE_VAL_ #define REDUCE_VAL_ maxval #undef REDUCE_LOC_ @@ -127,7 +124,7 @@ #undef MPP_GLOBAL_REDUCE_5D_ #define MPP_GLOBAL_REDUCE_5D_ mpp_global_min_i8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #undef REDUCE_VAL_ #define REDUCE_VAL_ minval #undef REDUCE_LOC_ @@ -135,7 +132,6 @@ #undef MPP_REDUCE_ #define MPP_REDUCE_ mpp_min #include -#endif #undef MPP_GLOBAL_REDUCE_2D_ #define MPP_GLOBAL_REDUCE_2D_ mpp_global_max_i4_2d @@ -146,7 +142,7 @@ #undef 
MPP_GLOBAL_REDUCE_5D_ #define MPP_GLOBAL_REDUCE_5D_ mpp_global_max_i4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef REDUCE_VAL_ #define REDUCE_VAL_ maxval #undef REDUCE_LOC_ @@ -164,7 +160,7 @@ #undef MPP_GLOBAL_REDUCE_5D_ #define MPP_GLOBAL_REDUCE_5D_ mpp_global_min_i4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef REDUCE_VAL_ #define REDUCE_VAL_ minval #undef REDUCE_LOC_ @@ -184,7 +180,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -192,7 +188,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -200,7 +196,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -208,16 +204,15 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include -#ifdef OVERLOAD_R4 #undef MPP_GLOBAL_SUM_ #define MPP_GLOBAL_SUM_ mpp_global_sum_r4_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -225,7 +220,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -233,7 +228,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -241,9 +236,8 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define 
MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #undef DO_EFP_SUM_ @@ -253,7 +247,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -261,7 +255,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -269,7 +263,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -277,7 +271,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #endif @@ -287,7 +281,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -295,7 +289,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -303,7 +297,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -311,17 +305,16 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #endif -#ifndef no_8byte_integers #undef MPP_GLOBAL_SUM_ #define MPP_GLOBAL_SUM_ mpp_global_sum_i8_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -329,7 +322,7 @@ #undef MPP_EXTRA_INDICES_ 
#define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -337,7 +330,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_ @@ -345,16 +338,15 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include -#endif #undef MPP_GLOBAL_SUM_ #define MPP_GLOBAL_SUM_ mpp_global_sum_i4_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -362,7 +354,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -370,7 +362,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_SUM_ @@ -378,7 +370,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include @@ -394,7 +386,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -402,7 +394,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -410,7 +402,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -418,7 +410,7 @@ #undef 
MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #ifdef OVERLOAD_C8 @@ -427,7 +419,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -435,7 +427,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -443,7 +435,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -451,17 +443,16 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #endif -#ifdef OVERLOAD_R4 #undef MPP_GLOBAL_SUM_TL_ #define MPP_GLOBAL_SUM_TL_ mpp_global_sum_tl_r4_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -469,7 +460,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -477,7 +468,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -485,9 +476,8 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #ifdef OVERLOAD_C4 #undef MPP_GLOBAL_SUM_TL_ @@ -495,7 +485,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) 
+#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -503,7 +493,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -511,7 +501,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -519,17 +509,16 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #endif -#ifndef no_8byte_integers #undef MPP_GLOBAL_SUM_TL_ #define MPP_GLOBAL_SUM_TL_ mpp_global_sum_tl_i8_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -537,7 +526,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -545,7 +534,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -553,16 +542,15 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include -#endif #undef MPP_GLOBAL_SUM_TL_ #define MPP_GLOBAL_SUM_TL_ mpp_global_sum_tl_i4_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -570,7 +558,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef 
MPP_GLOBAL_SUM_TL_ @@ -578,7 +566,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_SUM_TL_ @@ -586,7 +574,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include !gag @@ -602,7 +590,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -610,7 +598,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -618,7 +606,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -626,7 +614,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #ifdef OVERLOAD_C8 @@ -635,7 +623,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -643,7 +631,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -651,7 +639,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -659,17 +647,16 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #endif 
-#ifdef OVERLOAD_R4 #undef MPP_GLOBAL_SUM_AD_ #define MPP_GLOBAL_SUM_AD_ mpp_global_sum_ad_r4_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -677,7 +664,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -685,7 +672,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -693,9 +680,8 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #ifdef OVERLOAD_C4 #undef MPP_GLOBAL_SUM_AD_ @@ -703,7 +689,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -711,7 +697,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -719,7 +705,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -727,17 +713,16 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #endif -#ifndef no_8byte_integers #undef MPP_GLOBAL_SUM_AD_ #define MPP_GLOBAL_SUM_AD_ mpp_global_sum_ad_i8_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -745,7 +730,7 @@ 
#undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -753,7 +738,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -761,16 +746,15 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include -#endif #undef MPP_GLOBAL_SUM_AD_ #define MPP_GLOBAL_SUM_AD_ mpp_global_sum_ad_i4_2d #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -778,7 +762,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -786,7 +770,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_SUM_AD_ @@ -794,7 +778,7 @@ #undef MPP_EXTRA_INDICES_ #define MPP_EXTRA_INDICES_ ,:,:,: #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include !bnc @@ -813,7 +797,7 @@ #undef MPP_GLOBAL_FIELD_5D_ #define MPP_GLOBAL_FIELD_5D_ mpp_global_field2D_r8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #ifdef OVERLOAD_C8 @@ -826,11 +810,10 @@ #undef MPP_GLOBAL_FIELD_5D_ #define MPP_GLOBAL_FIELD_5D_ mpp_global_field2D_c8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #endif -#ifndef no_8byte_integers #undef MPP_GLOBAL_FIELD_2D_ #define MPP_GLOBAL_FIELD_2D_ mpp_global_field2D_i8_2d #undef MPP_GLOBAL_FIELD_3D_ @@ -840,7 
+823,7 @@ #undef MPP_GLOBAL_FIELD_5D_ #define MPP_GLOBAL_FIELD_5D_ mpp_global_field2D_i8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_FIELD_2D_ @@ -852,11 +835,9 @@ #undef MPP_GLOBAL_FIELD_5D_ #define MPP_GLOBAL_FIELD_5D_ mpp_global_field2D_l8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #include -#endif -#ifdef OVERLOAD_R4 #undef MPP_GLOBAL_FIELD_2D_ #define MPP_GLOBAL_FIELD_2D_ mpp_global_field2D_r4_2d #undef MPP_GLOBAL_FIELD_3D_ @@ -866,9 +847,8 @@ #undef MPP_GLOBAL_FIELD_5D_ #define MPP_GLOBAL_FIELD_5D_ mpp_global_field2D_r4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #ifdef OVERLOAD_C4 #undef MPP_GLOBAL_FIELD_2D_ @@ -880,7 +860,7 @@ #undef MPP_GLOBAL_FIELD_5D_ #define MPP_GLOBAL_FIELD_5D_ mpp_global_field2D_c4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #endif @@ -893,7 +873,7 @@ #undef MPP_GLOBAL_FIELD_5D_ #define MPP_GLOBAL_FIELD_5D_ mpp_global_field2D_i4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_FIELD_2D_ @@ -905,7 +885,7 @@ #undef MPP_GLOBAL_FIELD_5D_ #define MPP_GLOBAL_FIELD_5D_ mpp_global_field2D_l4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #include !**************************************************** @@ -918,7 +898,7 @@ #undef MPP_GLOBAL_FIELD_5D_AD_ #define MPP_GLOBAL_FIELD_5D_AD_ mpp_global_field2D_r8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #ifdef OVERLOAD_C8 @@ -931,11 +911,10 @@ #undef MPP_GLOBAL_FIELD_5D_AD_ #define MPP_GLOBAL_FIELD_5D_AD_ mpp_global_field2D_c8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #endif -#ifndef no_8byte_integers #undef 
MPP_GLOBAL_FIELD_2D_AD_ #define MPP_GLOBAL_FIELD_2D_AD_ mpp_global_field2D_i8_2d_ad #undef MPP_GLOBAL_FIELD_3D_AD_ @@ -945,7 +924,7 @@ #undef MPP_GLOBAL_FIELD_5D_AD_ #define MPP_GLOBAL_FIELD_5D_AD_ mpp_global_field2D_i8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_GLOBAL_FIELD_2D_AD_ @@ -957,11 +936,9 @@ #undef MPP_GLOBAL_FIELD_5D_AD_ #define MPP_GLOBAL_FIELD_5D_AD_ mpp_global_field2D_l8_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #include -#endif -#ifdef OVERLOAD_R4 #undef MPP_GLOBAL_FIELD_2D_AD_ #define MPP_GLOBAL_FIELD_2D_AD_ mpp_global_field2D_r4_2d_ad #undef MPP_GLOBAL_FIELD_3D_AD_ @@ -971,9 +948,8 @@ #undef MPP_GLOBAL_FIELD_5D_AD_ #define MPP_GLOBAL_FIELD_5D_AD_ mpp_global_field2D_r4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #ifdef OVERLOAD_C4 #undef MPP_GLOBAL_FIELD_2D_AD_ @@ -985,7 +961,7 @@ #undef MPP_GLOBAL_FIELD_5D_AD_ #define MPP_GLOBAL_FIELD_5D_AD_ mpp_global_field2D_c4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #endif @@ -998,7 +974,7 @@ #undef MPP_GLOBAL_FIELD_5D_AD_ #define MPP_GLOBAL_FIELD_5D_AD_ mpp_global_field2D_i4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_GLOBAL_FIELD_2D_AD_ @@ -1010,7 +986,7 @@ #undef MPP_GLOBAL_FIELD_5D_AD_ #define MPP_GLOBAL_FIELD_5D_AD_ mpp_global_field2D_l4_5d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #include !**************************************************** @@ -1019,7 +995,7 @@ #define MPP_DO_GLOBAL_FIELD_3D_ mpp_do_global_field2D_r8_3d #define MPP_DO_GLOBAL_FIELD_A2A_3D_ mpp_do_global_field2D_a2a_r8_3d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #ifdef OVERLOAD_C8 @@ -1028,17 +1004,16 @@ 
#define MPP_DO_GLOBAL_FIELD_3D_ mpp_do_global_field2D_c8_3d #define MPP_DO_GLOBAL_FIELD_A2A_3D_ mpp_do_global_field2D_a2a_c8_3d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #endif -#ifndef no_8byte_integers #undef MPP_DO_GLOBAL_FIELD_3D_ #undef MPP_DO_GLOBAL_FIELD_A2A_3D_ #define MPP_DO_GLOBAL_FIELD_3D_ mpp_do_global_field2D_i8_3d #define MPP_DO_GLOBAL_FIELD_A2A_3D_ mpp_do_global_field2D_a2a_i8_3d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_DO_GLOBAL_FIELD_3D_ @@ -1047,20 +1022,17 @@ #define MPP_DO_GLOBAL_FIELD_A2A_3D_ mpp_do_global_field2D_a2a_l8_3d #define LOGICAL_VARIABLE #undef MPP_TYPE_ -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #include #undef LOGICAL_VARIABLE -#endif -#ifdef OVERLOAD_R4 #undef MPP_DO_GLOBAL_FIELD_3D_ #undef MPP_DO_GLOBAL_FIELD_A2A_3D_ #define MPP_DO_GLOBAL_FIELD_3D_ mpp_do_global_field2D_r4_3d #define MPP_DO_GLOBAL_FIELD_A2A_3D_ mpp_do_global_field2D_a2a_r4_3d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #ifdef OVERLOAD_C4 #undef MPP_DO_GLOBAL_FIELD_3D_ @@ -1068,7 +1040,7 @@ #define MPP_DO_GLOBAL_FIELD_3D_ mpp_do_global_field2D_c4_3d #define MPP_DO_GLOBAL_FIELD_A2A_3D_ mpp_do_global_field2D_a2a_c4_3d #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #endif @@ -1077,7 +1049,7 @@ #define MPP_DO_GLOBAL_FIELD_3D_ mpp_do_global_field2D_i4_3d #define MPP_DO_GLOBAL_FIELD_A2A_3D_ mpp_do_global_field2D_a2a_i4_3d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_DO_GLOBAL_FIELD_3D_ @@ -1086,66 +1058,62 @@ #define MPP_DO_GLOBAL_FIELD_A2A_3D_ mpp_do_global_field2D_a2a_l4_3d #define LOGICAL_VARIABLE #undef MPP_TYPE_ -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #include #undef LOGICAL_VARIABLE 
!**************************************************** #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_r8_3d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include #ifdef OVERLOAD_C8 #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_c8_3d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ complex(DOUBLE_KIND) +#define MPP_TYPE_ complex(c8_kind) #include #endif -#ifndef no_8byte_integers #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_i8_3d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_l8_3d_ad #define LOGICAL_VARIABLE #undef MPP_TYPE_ -#define MPP_TYPE_ logical(LONG_KIND) +#define MPP_TYPE_ logical(l8_kind) #include #undef LOGICAL_VARIABLE -#endif -#ifdef OVERLOAD_R4 #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_r4_3d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #ifdef OVERLOAD_C4 #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_c4_3d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ complex(FLOAT_KIND) +#define MPP_TYPE_ complex(c4_kind) #include #endif #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_i4_3d_ad #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_l4_3d_ad #define LOGICAL_VARIABLE #undef MPP_TYPE_ -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(l4_kind) #include #undef LOGICAL_VARIABLE diff --git a/mpp/include/mpp_get_boundary.h b/mpp/include/mpp_get_boundary.h index fdc891df33..68ec1a2da5 100644 --- 
a/mpp/include/mpp_get_boundary.h +++ b/mpp/include/mpp_get_boundary.h @@ -34,8 +34,8 @@ subroutine MPP_GET_BOUNDARY_2D_(field, domain, ebuffer, sbuffer, wbuffer, nbuffe integer :: ntile logical :: need_ebuffer, need_sbuffer, need_wbuffer, need_nbuffer - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrs=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrs=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrs=-9999 integer, save :: bsize(4)=0, isize=0, jsize=0, ksize=0, pos, list=0, l_size=0, upflags integer :: buffer_size(4) integer :: max_ntile, tile, update_position, ishift, jshift @@ -175,8 +175,8 @@ subroutine MPP_GET_BOUNDARY_3D_(field, domain, ebuffer, sbuffer, wbuffer, nbuffe integer :: ntile logical :: need_ebuffer, need_sbuffer, need_wbuffer, need_nbuffer - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrs=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrs=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrs=-9999 integer, save :: bsize(4)=0, isize=0, jsize=0, ksize=0, pos, list=0, l_size=0, upflags integer :: buffer_size(4) integer :: max_ntile, tile, update_position, ishift, jshift @@ -321,10 +321,10 @@ subroutine MPP_GET_BOUNDARY_2D_V_(fieldx, fieldy, domain, ebufferx, sbufferx, wb logical :: need_ebufferx, need_sbufferx, need_wbufferx, need_nbufferx logical :: need_ebuffery, need_sbuffery, need_wbuffery, need_nbuffery - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsy=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsx=-9999 - 
integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsy=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsx=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsy=-9999 integer, save :: bsizex(4)=0, bsizey(4)=0, isize(2)=0, jsize(2)=0, ksize=0, l_size=0, list=0 integer, save :: offset_type, upflags integer :: bufferx_size(4), buffery_size(4) @@ -543,10 +543,10 @@ subroutine MPP_GET_BOUNDARY_3D_V_(fieldx, fieldy, domain, ebufferx, sbufferx, wb logical :: need_ebufferx, need_sbufferx, need_wbufferx, need_nbufferx logical :: need_ebuffery, need_sbuffery, need_wbuffery, need_nbuffery - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsy=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsx=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsy=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsx=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsy=-9999 integer, save :: bsizex(4)=0, bsizey(4)=0, isize(2)=0, jsize(2)=0, ksize=0, l_size=0, list=0 integer, save :: offset_type, upflags integer :: bufferx_size(4), buffery_size(4) diff --git a/mpp/include/mpp_get_boundary_ad.h b/mpp/include/mpp_get_boundary_ad.h index d131b76efb..91c865a659 100644 --- a/mpp/include/mpp_get_boundary_ad.h +++ b/mpp/include/mpp_get_boundary_ad.h @@ -36,8 +36,8 @@ subroutine MPP_GET_BOUNDARY_AD_2D_(field, domain, ebuffer, sbuffer, wbuffer, nbu integer :: 
ntile logical :: need_ebuffer, need_sbuffer, need_wbuffer, need_nbuffer - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrs=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrs=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrs=-9999 integer, save :: bsize(4)=0, isize=0, jsize=0, ksize=0, pos, list=0, l_size=0, upflags integer :: buffer_size(4) integer :: max_ntile, tile, update_position, ishift, jshift @@ -177,8 +177,8 @@ subroutine MPP_GET_BOUNDARY_AD_3D_(field, domain, ebuffer, sbuffer, wbuffer, nbu integer :: ntile logical :: need_ebuffer, need_sbuffer, need_wbuffer, need_nbuffer - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrs=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrs=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrs=-9999 integer, save :: bsize(4)=0, isize=0, jsize=0, ksize=0, pos, list=0, l_size=0, upflags integer :: buffer_size(4) integer :: max_ntile, tile, update_position, ishift, jshift @@ -323,10 +323,10 @@ subroutine MPP_GET_BOUNDARY_AD_2D_V_(fieldx, fieldy, domain, ebufferx, sbufferx, logical :: need_ebufferx, need_sbufferx, need_wbufferx, need_nbufferx logical :: need_ebuffery, need_sbuffery, need_wbuffery, need_nbuffery - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsy=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsx=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), 
save :: f_addrsy=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsx=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsy=-9999 integer, save :: bsizex(4)=0, bsizey(4)=0, isize(2)=0, jsize(2)=0, ksize=0, l_size=0, list=0 integer, save :: offset_type, upflags integer :: bufferx_size(4), buffery_size(4) @@ -545,10 +545,10 @@ subroutine MPP_GET_BOUNDARY_AD_3D_V_(fieldx, fieldy, domain, ebufferx, sbufferx, logical :: need_ebufferx, need_sbufferx, need_wbufferx, need_nbufferx logical :: need_ebuffery, need_sbuffery, need_wbuffery, need_nbuffery - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsy=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsx=-9999 - integer(LONG_KIND),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES), save :: f_addrsy=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsx=-9999 + integer(i8_kind),dimension(4,MAX_DOMAIN_FIELDS, MAX_TILES),save :: b_addrsy=-9999 integer, save :: bsizex(4)=0, bsizey(4)=0, isize(2)=0, jsize(2)=0, ksize=0, l_size=0, list=0 integer, save :: offset_type, upflags integer :: bufferx_size(4), buffery_size(4) diff --git a/mpp/include/mpp_io_connect.inc b/mpp/include/mpp_io_connect.inc index e4160076c9..b0c3d35e30 100644 --- a/mpp/include/mpp_io_connect.inc +++ b/mpp/include/mpp_io_connect.inc @@ -188,8 +188,8 @@ !ug support type(domain2d),pointer :: io_domain type(domainUG),pointer :: io_domain_ug - integer(INT_KIND) :: io_layout_ug - integer(INT_KIND) :: tile_id_ug + integer(i4_kind) :: io_layout_ug + integer(i4_kind) :: tile_id_ug !---------- integer*8 :: lenp integer :: comm diff --git a/mpp/include/mpp_io_misc.inc 
b/mpp/include/mpp_io_misc.inc index 52705407b9..a21541b8b1 100644 --- a/mpp/include/mpp_io_misc.inc +++ b/mpp/include/mpp_io_misc.inc @@ -49,7 +49,7 @@ integer :: unit_nml, io_status, iunit integer :: logunit, outunit, inunit, errunit logical :: opened - real(DOUBLE_KIND) :: doubledata = 0 + real(r8_kind) :: doubledata = 0 real :: realarray(4) if( module_is_initialized )return diff --git a/mpp/include/mpp_io_read.inc b/mpp/include/mpp_io_read.inc index 888aa411bb..7890785be0 100644 --- a/mpp/include/mpp_io_read.inc +++ b/mpp/include/mpp_io_read.inc @@ -36,24 +36,22 @@ #undef MPP_READ_2DDECOMP_4D_ #define MPP_READ_2DDECOMP_4D_ mpp_read_2ddecomp_r4d #undef MPP_TYPE_ -#define MPP_TYPE_ real +#define MPP_TYPE_ real(r8_kind) #include -#ifdef OVERLOAD_R8 #undef READ_RECORD_CORE_ -#define READ_RECORD_CORE_ read_record_core_r8 +#define READ_RECORD_CORE_ read_record_core_r4 #undef READ_RECORD_ -#define READ_RECORD_ read_record_r8 +#define READ_RECORD_ read_record_r4 #undef MPP_READ_2DDECOMP_2D_ -#define MPP_READ_2DDECOMP_2D_ mpp_read_2ddecomp_r2d_r8 +#define MPP_READ_2DDECOMP_2D_ mpp_read_2ddecomp_r2d_r4 #undef MPP_READ_2DDECOMP_3D_ -#define MPP_READ_2DDECOMP_3D_ mpp_read_2ddecomp_r3d_r8 +#define MPP_READ_2DDECOMP_3D_ mpp_read_2ddecomp_r3d_r4 #undef MPP_READ_2DDECOMP_4D_ -#define MPP_READ_2DDECOMP_4D_ mpp_read_2ddecomp_r4d_r8 +#define MPP_READ_2DDECOMP_4D_ mpp_read_2ddecomp_r4d_r4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #undef MPP_READ_COMPRESSED_1D_ #define MPP_READ_COMPRESSED_1D_ mpp_read_compressed_r1d @@ -131,7 +129,7 @@ subroutine mpp_read_region_r2D(unit, field, data, start, nread) integer, intent(in) :: unit type(fieldtype), intent(in) :: field - real, intent(inout) :: data(:,:) + real(r4_kind), intent(inout) :: data(:,:) integer, intent(in) :: start(:), nread(:) if(size(start(:)) .NE. 4 .OR. size(nread(:)) .NE. 4) call mpp_error(FATAL, & @@ -142,7 +140,7 @@ endif if(nread(3) .NE. 1 .OR. nread(4) .NE. 
1) call mpp_error(FATAL, & "mpp_io_read.inc(mpp_read_region_r2D): nread(3) and nread(4) must be 1") - call read_record_core(unit, field, nread(1)*nread(2), data, start, nread) + call read_record_core_r4(unit, field, nread(1)*nread(2), data, start, nread) return @@ -152,7 +150,7 @@ subroutine mpp_read_region_r3D(unit, field, data, start, nread) integer, intent(in) :: unit type(fieldtype), intent(in) :: field - real, intent(inout) :: data(:,:,:) + real(r4_kind), intent(inout) :: data(:,:,:) integer, intent(in) :: start(:), nread(:) if(size(start(:)) .NE. 4 .OR. size(nread(:)) .NE. 4) call mpp_error(FATAL, & @@ -163,16 +161,15 @@ endif if(nread(4) .NE. 1) call mpp_error(FATAL, & "mpp_io_read.inc(mpp_read_region_r3D): nread(4) must be 1") - call read_record_core(unit, field, nread(1)*nread(2)*nread(3), data, start, nread) + call read_record_core_r4(unit, field, nread(1)*nread(2)*nread(3), data, start, nread) return end subroutine mpp_read_region_r3D -#ifdef OVERLOAD_R8 subroutine mpp_read_region_r2D_r8(unit, field, data, start, nread) integer, intent(in) :: unit type(fieldtype), intent(in) :: field - real(kind=DOUBLE_KIND), intent(inout) :: data(:,:) + real(kind=r8_kind), intent(inout) :: data(:,:) integer, intent(in) :: start(:), nread(:) if(size(start(:)) .NE. 4 .OR. size(nread(:)) .NE. 4) call mpp_error(FATAL, & @@ -183,7 +180,7 @@ endif if(nread(3) .NE. 1 .OR. nread(4) .NE. 
1) call mpp_error(FATAL, & "mpp_io_read.inc(mpp_read_region_r2D_r8): nread(3) and nread(4) must be 1") - call read_record_core_r8(unit, field, nread(1)*nread(2), data, start, nread) + call read_record_core(unit, field, nread(1)*nread(2), data, start, nread) return end subroutine mpp_read_region_r2D_r8 @@ -191,7 +188,7 @@ subroutine mpp_read_region_r3D_r8(unit, field, data, start, nread) integer, intent(in) :: unit type(fieldtype), intent(in) :: field - real(kind=DOUBLE_KIND), intent(inout) :: data(:,:,:) + real(kind=r8_kind), intent(inout) :: data(:,:,:) integer, intent(in) :: start(:), nread(:) if(size(start(:)) .NE. 4 .OR. size(nread(:)) .NE. 4) call mpp_error(FATAL, & @@ -202,11 +199,10 @@ endif if(nread(4) .NE. 1) call mpp_error(FATAL, & "mpp_io_read.inc(mpp_read_region_r3D_r8): nread(4) must be 1") - call read_record_core_r8(unit, field, nread(1)*nread(2)*nread(3), data, start, nread) + call read_record_core(unit, field, nread(1)*nread(2)*nread(3), data, start, nread) return end subroutine mpp_read_region_r3D_r8 -#endif !--- Assume the text field is at most two-dimensional @@ -322,14 +318,14 @@ integer, allocatable, dimension(:) :: dimids character(len=128) :: name, attname, unlimname, attval, bounds_name logical :: isdim, found_bounds, get_time_info - integer(LONG_KIND) :: checksumf + integer(i8_kind) :: checksumf character(len=64) :: checksum_char integer :: num_checksumf, last, is, k - integer(SHORT_KIND), allocatable :: i2vals(:) - integer(INT_KIND), allocatable :: ivals(:) - real(FLOAT_KIND), allocatable :: rvals(:) - real(DOUBLE_KIND), allocatable :: r8vals(:) + integer(i2_kind), allocatable :: i2vals(:) + integer(i4_kind), allocatable :: ivals(:) + real(r4_kind), allocatable :: rvals(:) + real(r8_kind), allocatable :: r8vals(:) get_time_info = .TRUE. 
if(present(read_time)) get_time_info = read_time diff --git a/mpp/include/mpp_io_unstructured_read.inc b/mpp/include/mpp_io_unstructured_read.inc index f277baa8b9..ce64ad7ca5 100644 --- a/mpp/include/mpp_io_unstructured_read.inc +++ b/mpp/include/mpp_io_unstructured_read.inc @@ -33,23 +33,23 @@ subroutine mpp_io_unstructured_read_r_1D(funit, & threading) !Inputs/outputs - integer(INT_KIND),intent(in) :: funit ! ! -! +! ! subroutine mpp_get_times( unit, time_values ) diff --git a/mpp/include/mpp_io_write.inc b/mpp/include/mpp_io_write.inc index 8187615e72..a3707e9b7f 100644 --- a/mpp/include/mpp_io_write.inc +++ b/mpp/include/mpp_io_write.inc @@ -684,7 +684,7 @@ integer, intent(in), optional :: pack character(len=*), intent(in), optional :: time_method character(len=*), intent(in), optional :: standard_name - integer(LONG_KIND), dimension(:), intent(in), optional :: checksum + integer(i8_kind), dimension(:), intent(in), optional :: checksum !this array is required because of f77 binding on netCDF interface integer, allocatable :: axis_id(:) real :: a, b @@ -952,46 +952,46 @@ !pack was only meaningful for FP numbers, but is now extended by the ival branch of this routine if( PRESENT(pack) )then if( pack== 0 ) then !! here be dragons, use ival branch!... - if( KIND(rval).EQ.DOUBLE_KIND )then + if( KIND(rval).EQ.r8_kind )then call mpp_error( FATAL, & 'WRITE_ATTRIBUTE_NETCDF: attempting to write internal NF_INT, currently int32, as double.' ) - else if( KIND(rval).EQ.FLOAT_KIND )then + else if( KIND(rval).EQ.r4_kind )then call mpp_error( FATAL, & 'WRITE_ATTRIBUTE_NETCDF: attempting to write internal NF_INT, currently int32, as float.' 
) end if else if( pack.EQ.1 )then - if( KIND(rval).EQ.DOUBLE_KIND )then - error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_DOUBLE, size(rval(:)), real(rval,kind=DOUBLE_KIND) ) - else if( KIND(rval).EQ.FLOAT_KIND )then + if( KIND(rval).EQ.r8_kind )then + error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_DOUBLE, size(rval(:)), real(rval, kind=r8_kind)) + else if( KIND(rval).EQ.r4_kind )then call mpp_error( WARNING, & 'WRITE_ATTRIBUTE_NETCDF: attempting to write internal 32-bit real as external 64-bit.' ) - error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_DOUBLE, size(rval(:)), real(rval,kind=FLOAT_KIND) ) + error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_DOUBLE, size(rval(:)), real(rval, kind=r4_kind)) end if call netcdf_err( error, mpp_file(unit), string=' Attribute='//name ) else if( pack.EQ.2 )then - if( KIND(rval).EQ.DOUBLE_KIND )then - error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_FLOAT, size(rval(:)), real(rval,kind=DOUBLE_KIND) ) - else if( KIND(rval).EQ.FLOAT_KIND )then - error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_FLOAT, size(rval(:)), real(rval,kind=FLOAT_KIND) ) + if( KIND(rval).EQ.r8_kind )then + error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_FLOAT, size(rval(:)), real(rval, kind=r8_kind)) + else if( KIND(rval).EQ.r4_kind )then + error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_FLOAT, size(rval(:)), real(rval, kind=r4_kind)) end if call netcdf_err( error, mpp_file(unit), string=' Attribute='//name ) else if( pack.EQ.4 )then allocate( rval_i(size(rval(:))) ) rval_i = rval - if( KIND(rval).EQ.DOUBLE_KIND )then - error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_SHORT, size(rval_i(:)), real(rval,kind=DOUBLE_KIND) ) - else if( KIND(rval).EQ.FLOAT_KIND )then - error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_SHORT, size(rval_i(:)), real(rval,kind=FLOAT_KIND) ) + if( KIND(rval).EQ.r8_kind )then + error = NF_PUT_ATT_DOUBLE( 
mpp_file(unit)%ncid, id, name, NF_SHORT, size(rval_i(:)), real(rval, kind=r8_kind)) + else if( KIND(rval).EQ.r4_kind )then + error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_SHORT, size(rval_i(:)), real(rval, kind=r4_kind)) end if call netcdf_err( error, mpp_file(unit), string=' Attribute='//name ) deallocate(rval_i) else if( pack.EQ.8 )then allocate( rval_i(size(rval(:))) ) rval_i = rval - if( KIND(rval).EQ.DOUBLE_KIND )then - error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_BYTE, size(rval_i(:)), real(rval,kind=DOUBLE_KIND) ) - else if( KIND(rval).EQ.FLOAT_KIND )then - error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_BYTE, size(rval_i(:)), real(rval,kind=FLOAT_KIND) ) + if( KIND(rval).EQ.r8_kind )then + error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_BYTE, size(rval_i(:)), real(rval, kind=r8_kind)) + else if( KIND(rval).EQ.r4_kind )then + error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_BYTE, size(rval_i(:)), real(rval, kind=r4_kind)) end if call netcdf_err( error, mpp_file(unit), string=' Attribute='//name ) deallocate(rval_i) @@ -1000,17 +1000,17 @@ end if else !default is to write FLOATs (32-bit) - if( KIND(rval).EQ.DOUBLE_KIND )then - error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_FLOAT, size(rval(:)), real(rval,kind=DOUBLE_KIND) ) - else if( KIND(rval).EQ.FLOAT_KIND )then - error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_FLOAT, size(rval(:)), real(rval,kind=FLOAT_KIND) ) + if( KIND(rval).EQ.r8_kind )then + error = NF_PUT_ATT_DOUBLE( mpp_file(unit)%ncid, id, name, NF_FLOAT, size(rval(:)), real(rval, kind=r8_kind)) + else if( KIND(rval).EQ.r4_kind )then + error = NF_PUT_ATT_REAL ( mpp_file(unit)%ncid, id, name, NF_FLOAT, size(rval(:)), real(rval, kind=r4_kind)) end if call netcdf_err( error, mpp_file(unit), string=' Attribute='//name ) end if else if( PRESENT(ival) )then if( PRESENT(pack) ) then if (pack ==0) then - if (KIND(ival).EQ.LONG_KIND ) then + if 
(KIND(ival).EQ.i8_kind) then call mpp_error(FATAL,'only use NF_INTs with pack=0 for now') end if error = NF_PUT_ATT_INT( mpp_file(unit)%ncid, id, name, NF_INT, size(ival(:)), ival ) !!XXX int32_t.. @@ -1062,7 +1062,7 @@ ! mpp_write( unit, field, data, tstamp ) ! ! integer, intent(in) :: unit ! ! type(fieldtype), intent(in) :: field ! -! real(DOUBLE_KIND), optional :: tstamp ! +! real(r8_kind), optional :: tstamp ! ! data is real and can be scalar or of rank 1-3. ! ! ! ! For distributed data, use ! @@ -1071,7 +1071,7 @@ ! integer, intent(in) :: unit ! ! type(fieldtype), intent(in) :: field ! ! type(domain2D), intent(in) :: domain ! -! real(DOUBLE_KIND), optional :: tstamp ! +! real(r8_kind), optional :: tstamp ! ! data is real and can be of rank 2 or 3. ! ! ! ! mpp_write( unit, axis ) ! @@ -1095,22 +1095,20 @@ #undef MPP_WRITE_2DDECOMP_4D_ #define MPP_WRITE_2DDECOMP_4D_ mpp_write_2ddecomp_r4d #undef MPP_TYPE_ -#define MPP_TYPE_ real +#define MPP_TYPE_ real(r8_kind) #include -#ifdef OVERLOAD_R8 #undef WRITE_RECORD_ -#define WRITE_RECORD_ write_record_r8 +#define WRITE_RECORD_ write_record_r4 #undef MPP_WRITE_2DDECOMP_2D_ -#define MPP_WRITE_2DDECOMP_2D_ mpp_write_2ddecomp_r2d_r8 +#define MPP_WRITE_2DDECOMP_2D_ mpp_write_2ddecomp_r2d_r4 #undef MPP_WRITE_2DDECOMP_3D_ -#define MPP_WRITE_2DDECOMP_3D_ mpp_write_2ddecomp_r3d_r8 +#define MPP_WRITE_2DDECOMP_3D_ mpp_write_2ddecomp_r3d_r4 #undef MPP_WRITE_2DDECOMP_4D_ -#define MPP_WRITE_2DDECOMP_4D_ mpp_write_2ddecomp_r4d_r8 +#define MPP_WRITE_2DDECOMP_4D_ mpp_write_2ddecomp_r4d_r4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #undef MPP_WRITE_COMPRESSED_1D_ #define MPP_WRITE_COMPRESSED_1D_ mpp_write_compressed_r1d @@ -1574,7 +1572,7 @@ integer, intent(in), optional :: pack character(len=*), intent(in), optional :: time_method character(len=*), intent(in), optional :: standard_name - integer(LONG_KIND), dimension(:), intent(in), optional :: checksum + integer(i8_kind), 
dimension(:), intent(in), optional :: checksum !this array is required because of f77 binding on netCDF interface integer, allocatable :: axis_id(:) real :: a, b @@ -1648,3 +1646,4 @@ return end subroutine fillin_fieldtype + diff --git a/mpp/include/mpp_read_2Ddecomp.h b/mpp/include/mpp_read_2Ddecomp.h index e16f2c0e0d..5ba8c245b2 100644 --- a/mpp/include/mpp_read_2Ddecomp.h +++ b/mpp/include/mpp_read_2Ddecomp.h @@ -23,14 +23,14 @@ MPP_TYPE_, intent(inout) :: data(nwords) integer, intent(in) :: start(:), axsiz(:) - integer(SHORT_KIND) :: i2vals(nwords) + integer(i2_kind) :: i2vals(nwords) !rab used in conjunction with transfer intrinsic to determine size of a variable integer(KIND=1) :: one_byte(8) integer :: word_sz - integer(INT_KIND) :: ivals(nwords) - real(FLOAT_KIND) :: rvals(nwords) + integer(i4_kind) :: ivals(nwords) + real(r4_kind) :: rvals(nwords) - real(DOUBLE_KIND) :: r8vals(nwords) + real(r8_kind) :: r8vals(nwords) pointer( ptr1, i2vals ) pointer( ptr2, ivals ) pointer( ptr3, rvals ) diff --git a/mpp/include/mpp_read_compressed.h b/mpp/include/mpp_read_compressed.h index 5dcce1b73f..d281a9de73 100644 --- a/mpp/include/mpp_read_compressed.h +++ b/mpp/include/mpp_read_compressed.h @@ -44,7 +44,7 @@ integer :: npes, p, threading_flag type(domain2d), pointer :: io_domain=>NULL() logical :: compute_chksum,print_compressed_chksum - integer(LONG_KIND) ::chk + integer(i8_kind) ::chk call mpp_clock_begin(mpp_read_clock) @@ -138,7 +138,7 @@ integer :: npes, p, threading_flag type(domain2d), pointer :: io_domain=>NULL() logical :: compute_chksum,print_compressed_chksum - integer(LONG_KIND) ::chk + integer(i8_kind) ::chk call mpp_clock_begin(mpp_read_clock) diff --git a/mpp/include/mpp_scatter.h b/mpp/include/mpp_scatter.h index b6e669e3e8..ae96ae24df 100644 --- a/mpp/include/mpp_scatter.h +++ b/mpp/include/mpp_scatter.h @@ -16,6 +16,27 @@ !* You should have received a copy of the GNU Lesser General Public !* License along with FMS. If not, see . 
!*********************************************************************** + ! + ! + ! Scatter data from one pe to the specified pes. + ! + ! + ! + ! Scatter (ie - is) * (je - js) contiguous elements of array data from the designated root pe + ! into contiguous members of array segment in each pe that is included in the pelist argument. + ! + ! Start and end index of the first dimension of the segment array + ! Start and end index of the second dimension of the segment array + ! The PE list of target pes, needs to be in monotonic increasing order. + ! The root pe is allowed to be included (see input is_root_pe). If a pe is absent in this list then + ! its segment array is not updated. + ! The 2D array that the data is to be copied into + ! The source array. + ! True if the calling pe is the root pe, false otherwise. + ! Offsets specifying the first element in data array. subroutine MPP_SCATTER_PELIST_2D_(is, ie, js, je, pelist, array_seg, data, is_root_pe, & ishift, jshift) integer, intent(in) :: is, ie, js, je diff --git a/mpp/include/mpp_transmit_nocomm.h b/mpp/include/mpp_transmit_nocomm.h index 5a5fd36a4d..00df9b40ba 100644 --- a/mpp/include/mpp_transmit_nocomm.h +++ b/mpp/include/mpp_transmit_nocomm.h @@ -50,7 +50,7 @@ integer :: i, outunit MPP_TYPE_, allocatable, save :: local_data(:) !local copy used by non-parallel code (no SHMEM or MPI) - integer(LONG_KIND), save :: get_data_addr=-9999 + integer(i8_kind), save :: get_data_addr=-9999 MPP_TYPE_ :: get_data_local(get_len_nocomm) pointer(ptr_get_data, get_data_local) diff --git a/mpp/include/mpp_unstruct_domain.inc b/mpp/include/mpp_unstruct_domain.inc index c936973156..b8c1f8d9d6 100644 --- a/mpp/include/mpp_unstruct_domain.inc +++ b/mpp/include/mpp_unstruct_domain.inc @@ -650,7 +650,7 @@ function mpp_domain_UG_is_tile_root_pe(domain) result(is_root) ! 
-#ifdef OVERLOAD_R4 #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #undef mpp_pass_SG_to_UG_2D_ #define mpp_pass_SG_to_UG_2D_ mpp_pass_SG_to_UG_r4_2d #undef mpp_pass_SG_to_UG_3D_ @@ -815,10 +814,9 @@ end subroutine mpp_deallocate_domainUG #undef mpp_pass_UG_to_SG_3D_ #define mpp_pass_UG_to_SG_3D_ mpp_pass_UG_to_SG_r4_3d #include -#endif #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #undef mpp_pass_SG_to_UG_2D_ #define mpp_pass_SG_to_UG_2D_ mpp_pass_SG_to_UG_i4_2d #undef mpp_pass_SG_to_UG_3D_ @@ -830,7 +828,7 @@ end subroutine mpp_deallocate_domainUG #include #undef MPP_TYPE_ -#define MPP_TYPE_ logical(INT_KIND) +#define MPP_TYPE_ logical(i4_kind) #undef mpp_pass_SG_to_UG_2D_ #define mpp_pass_SG_to_UG_2D_ mpp_pass_SG_to_UG_l4_2d #undef mpp_pass_SG_to_UG_3D_ @@ -850,10 +848,9 @@ end subroutine mpp_deallocate_domainUG #undef MPP_GLOBAL_FIELD_UG_5D_ #define MPP_GLOBAL_FIELD_UG_5D_ mpp_global_field2D_ug_r8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(DOUBLE_KIND) +#define MPP_TYPE_ real(r8_kind) #include -#ifndef no_8byte_integers #undef MPP_GLOBAL_FIELD_UG_2D_ #define MPP_GLOBAL_FIELD_UG_2D_ mpp_global_field2D_ug_i8_2d #undef MPP_GLOBAL_FIELD_UG_3D_ @@ -863,11 +860,9 @@ end subroutine mpp_deallocate_domainUG #undef MPP_GLOBAL_FIELD_UG_5D_ #define MPP_GLOBAL_FIELD_UG_5D_ mpp_global_field2D_ug_i8_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(LONG_KIND) +#define MPP_TYPE_ integer(i8_kind) #include -#endif -#ifdef OVERLOAD_R4 #undef MPP_GLOBAL_FIELD_UG_2D_ #define MPP_GLOBAL_FIELD_UG_2D_ mpp_global_field2D_ug_r4_2d #undef MPP_GLOBAL_FIELD_UG_3D_ @@ -877,9 +872,8 @@ end subroutine mpp_deallocate_domainUG #undef MPP_GLOBAL_FIELD_UG_5D_ #define MPP_GLOBAL_FIELD_UG_5D_ mpp_global_field2D_ug_r4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ real(FLOAT_KIND) +#define MPP_TYPE_ real(r4_kind) #include -#endif #undef MPP_GLOBAL_FIELD_UG_2D_ #define MPP_GLOBAL_FIELD_UG_2D_ mpp_global_field2D_ug_i4_2d @@ 
-890,5 +884,5 @@ end subroutine mpp_deallocate_domainUG #undef MPP_GLOBAL_FIELD_UG_5D_ #define MPP_GLOBAL_FIELD_UG_5D_ mpp_global_field2D_ug_i4_5d #undef MPP_TYPE_ -#define MPP_TYPE_ integer(INT_KIND) +#define MPP_TYPE_ integer(i4_kind) #include diff --git a/mpp/include/mpp_update_domains2D.h b/mpp/include/mpp_update_domains2D.h index aecd6c10ff..759580f547 100644 --- a/mpp/include/mpp_update_domains2D.h +++ b/mpp/include/mpp_update_domains2D.h @@ -51,7 +51,7 @@ integer :: update_position, update_whalo, update_ehalo, update_shalo, update_nhalo, ntile - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES),save :: f_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES),save :: f_addrs=-9999 integer :: tile, max_ntile character(len=3) :: text logical :: set_mismatch, is_complete @@ -248,14 +248,14 @@ type(DomainCommunicator2D),pointer,save :: d_comm =>NULL() logical :: do_redist,free_comm integer :: lsize - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: l_addrs_in=-9999, l_addrs_out=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: l_addrs_in=-9999, l_addrs_out=-9999 integer, save :: isize_in=0,jsize_in=0,ke_in=0,l_size=0 integer, save :: isize_out=0,jsize_out=0,ke_out=0 logical :: set_mismatch integer :: ke character(len=2) :: text MPP_TYPE_ :: d_type - integer(LONG_KIND) :: floc_in, floc_out + integer(i8_kind) :: floc_in, floc_out floc_in = 0 floc_out = 0 @@ -418,7 +418,7 @@ integer :: grid_offset_type logical :: exchange_uv - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES),save :: f_addrsx=-9999, f_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES),save :: f_addrsx=-9999, f_addrsy=-9999 logical :: do_update, is_complete integer, save :: isize(2)=0,jsize(2)=0,ke=0,l_size=0, offset_type=0, list=0 integer, save :: whalosz, ehalosz, shalosz, nhalosz diff --git a/mpp/include/mpp_update_domains2D_ad.h b/mpp/include/mpp_update_domains2D_ad.h index ee46e985ba..f1ef55eae0 100644 --- 
a/mpp/include/mpp_update_domains2D_ad.h +++ b/mpp/include/mpp_update_domains2D_ad.h @@ -51,7 +51,7 @@ integer :: update_position, update_whalo, update_ehalo, update_shalo, update_nhalo, ntile - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES),save :: f_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES),save :: f_addrs=-9999 integer :: tile, max_ntile character(len=3) :: text logical :: set_mismatch, is_complete @@ -257,7 +257,7 @@ integer :: grid_offset_type logical :: exchange_uv - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS, MAX_TILES),save :: f_addrsx=-9999, f_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS, MAX_TILES),save :: f_addrsx=-9999, f_addrsy=-9999 logical :: do_update, is_complete integer, save :: isize(2)=0,jsize(2)=0,ke=0,l_size=0, offset_type=0, list=0 integer, save :: whalosz, ehalosz, shalosz, nhalosz diff --git a/mpp/include/mpp_update_domains2D_nonblock.h b/mpp/include/mpp_update_domains2D_nonblock.h index 9d96d0fe52..37df69fa5f 100644 --- a/mpp/include/mpp_update_domains2D_nonblock.h +++ b/mpp/include/mpp_update_domains2D_nonblock.h @@ -63,7 +63,7 @@ function MPP_START_UPDATE_DOMAINS_3D_( field, domain, flags, position, & integer, save :: pos, whalosz, ehalosz, shalosz, nhalosz, update_flags_saved character(len=128) :: text, field_name integer, save :: ke_list(MAX_DOMAIN_FIELDS, MAX_TILES)=0 - integer(LONG_KIND), save :: f_addrs(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 + integer(i8_kind), save :: f_addrs(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 type(overlapSpec), pointer :: update => NULL() MPP_TYPE_ :: d_type @@ -329,7 +329,7 @@ subroutine MPP_COMPLETE_UPDATE_DOMAINS_3D_( id_update, field, domain, flags, pos integer :: ke_max integer, save :: list=0, l_size=0 integer, save :: ke_list(MAX_DOMAIN_FIELDS, MAX_TILES)=0 - integer(LONG_KIND), save :: f_addrs(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 + integer(i8_kind), save :: f_addrs(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 character(len=128) :: text MPP_TYPE_ :: d_type @@ 
-538,8 +538,8 @@ function MPP_START_UPDATE_DOMAINS_3D_V_( fieldx, fieldy, domain, flags, gridtype integer, save :: whalosz, ehalosz, shalosz, nhalosz integer, save :: isize(2)=0,jsize(2)=0,l_size=0, offset_type=0, list=0 integer, save :: ke_list (MAX_DOMAIN_FIELDS, MAX_TILES)=0 - integer(LONG_KIND), save :: f_addrsx(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 - integer(LONG_KIND), save :: f_addrsy(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 + integer(i8_kind), save :: f_addrsx(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 + integer(i8_kind), save :: f_addrsy(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 type(overlapSpec), pointer :: updatex => NULL() type(overlapSpec), pointer :: updatey => NULL() MPP_TYPE_ :: d_type @@ -853,8 +853,8 @@ subroutine MPP_COMPLETE_UPDATE_DOMAINS_3D_V_( id_update, fieldx, fieldy, domain, character(len=128) :: text integer, save :: l_size=0, list=0 integer, save :: ke_list (MAX_DOMAIN_FIELDS, MAX_TILES)=0 - integer(LONG_KIND), save :: f_addrsx(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 - integer(LONG_KIND), save :: f_addrsy(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 + integer(i8_kind), save :: f_addrsx(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 + integer(i8_kind), save :: f_addrsy(MAX_DOMAIN_FIELDS, MAX_TILES)=-9999 type(overlapSpec), pointer :: updatex => NULL() type(overlapSpec), pointer :: updatey => NULL() MPP_TYPE_ :: d_type diff --git a/mpp/include/mpp_update_nest_domains.h b/mpp/include/mpp_update_nest_domains.h index c75d22d6e8..99d8242da3 100644 --- a/mpp/include/mpp_update_nest_domains.h +++ b/mpp/include/mpp_update_nest_domains.h @@ -19,19 +19,30 @@ !*********************************************************************** subroutine MPP_UPDATE_NEST_FINE_2D_(field, nest_domain, wbuffer, ebuffer, sbuffer, nbuffer, & nest_level, flags, complete, position, extra_halo, name, tile_count) - MPP_TYPE_, intent(in) :: field(:,:) - type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: wbuffer(:,:) - MPP_TYPE_, intent(inout) :: ebuffer(:,:) - MPP_TYPE_, intent(inout) 
:: sbuffer(:,:) - MPP_TYPE_, intent(inout) :: nbuffer(:,:) - integer, intent(in) :: nest_level - integer, intent(in), optional :: flags - logical, intent(in), optional :: complete - integer, intent(in), optional :: position - integer, intent(in), optional :: extra_halo - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: field(:,:) !< field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + MPP_TYPE_, intent(inout) :: wbuffer(:,:) !< west side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: ebuffer(:,:) !< east side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: sbuffer(:,:) !< south side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: nbuffer(:,:) !< north side buffer to be filled + !! with data on coarse grid. + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + integer, intent(in), optional :: flags !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. + logical, intent(in), optional :: complete !< When .true., do the buffer filling. + !! Default value is .true. + integer, intent(in), optional :: position !< Cell position. Its value should be + !! CENTER, EAST, CORNER, or NORTH. Default is CENTER. + integer, intent(in), optional :: extra_halo !< extra halo for passing data + !! from coarse grid to fine grid. + !! Default is 0 and currently only support extra_halo = 0. + character(len=*), intent(in), optional :: name !< Name of the nest domain. + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. 
MPP_TYPE_ :: field3D(size(field,1),size(field,2),1) MPP_TYPE_ :: wbuffer3D(size(wbuffer,1),size(wbuffer,2),1) @@ -58,27 +69,38 @@ end subroutine MPP_UPDATE_NEST_FINE_2D_ subroutine MPP_UPDATE_NEST_FINE_3D_(field, nest_domain, wbuffer, sbuffer, ebuffer, nbuffer, & nest_level, flags, complete, position, extra_halo, name, tile_count) - MPP_TYPE_, intent(in) :: field(:,:,:) - type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: wbuffer(:,:,:) - MPP_TYPE_, intent(inout) :: ebuffer(:,:,:) - MPP_TYPE_, intent(inout) :: sbuffer(:,:,:) - MPP_TYPE_, intent(inout) :: nbuffer(:,:,:) - integer, intent(in) :: nest_level - integer, intent(in), optional :: flags - logical, intent(in), optional :: complete - integer, intent(in), optional :: position - integer, intent(in), optional :: extra_halo - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: field(:,:,:) !< field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + MPP_TYPE_, intent(inout) :: wbuffer(:,:,:) !< west side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: ebuffer(:,:,:) !< east side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: sbuffer(:,:,:) !< south side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: nbuffer(:,:,:) !< north side buffer to be filled + !! with data on coarse grid. + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + integer, intent(in), optional :: flags !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. + logical, intent(in), optional :: complete !< When .true., do the buffer filling. + !! Default value is .true. + integer, intent(in), optional :: position !< Cell position. It value should be + !! 
CENTER, EAST, CORNER, or NORTH. Default is CENTER. + integer, intent(in), optional :: extra_halo !< extra halo for passing data + !! from coarse grid to fine grid. + !! Default is 0 and currently only support extra_halo = 0. + character(len=*), intent(in), optional :: name !< Name of the nest domain. + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. MPP_TYPE_ :: d_type type(nestSpec), pointer :: update=>NULL() - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: f_addrs=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: wb_addrs=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: eb_addrs=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: sb_addrs=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: nb_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: f_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: wb_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: eb_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: sb_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: nb_addrs=-9999 character(len=3) :: text logical :: is_complete, set_mismatch integer :: tile @@ -184,19 +206,30 @@ end subroutine MPP_UPDATE_NEST_FINE_3D_ !############################################################################### subroutine MPP_UPDATE_NEST_FINE_4D_(field, nest_domain, wbuffer, ebuffer, sbuffer, nbuffer, & nest_level, flags, complete, position, extra_halo, name, tile_count) - MPP_TYPE_, intent(in) :: field(:,:,:,:) - type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: wbuffer(:,:,:,:) - MPP_TYPE_, intent(inout) :: ebuffer(:,:,:,:) - MPP_TYPE_, intent(inout) :: sbuffer(:,:,:,:) - MPP_TYPE_, intent(inout) :: nbuffer(:,:,:,:) - integer, intent(in) :: nest_level - integer, intent(in), optional :: flags - logical, intent(in), 
optional :: complete - integer, intent(in), optional :: position - integer, intent(in), optional :: extra_halo - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: field(:,:,:,:) !< field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + MPP_TYPE_, intent(inout) :: wbuffer(:,:,:,:) !< west side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: ebuffer(:,:,:,:) !< east side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: sbuffer(:,:,:,:) !< south side buffer to be filled + !! with data on coarse grid. + MPP_TYPE_, intent(inout) :: nbuffer(:,:,:,:) !< north side buffer to be filled + !! with data on coarse grid. + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + integer, intent(in), optional :: flags !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. + logical, intent(in), optional :: complete !< When .true., do the buffer filling. + !! Default value is .true. + integer, intent(in), optional :: position !< Cell position. It value should be + !! CENTER, EAST, CORNER, or NORTH. Default is CENTER. + integer, intent(in), optional :: extra_halo !< extra halo for passing data + !! from coarse grid to fine grid. + !! Default is 0 and currently only support extra_halo = 0. + character(len=*), intent(in), optional :: name !< Name of the nest domain. + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. 
MPP_TYPE_ :: field3D(size(field,1),size(field,2),size(field,3)*size(field,4)) MPP_TYPE_ :: wbuffer3D(size(wbuffer,1),size(wbuffer,2),size(wbuffer,3)*size(wbuffer,4)) @@ -226,19 +259,29 @@ end subroutine MPP_UPDATE_NEST_FINE_4D_ subroutine MPP_UPDATE_NEST_FINE_2D_V_(fieldx, fieldy, nest_domain, wbufferx, wbuffery, sbufferx, sbuffery, & ebufferx, ebuffery, nbufferx, nbuffery, nest_level, & flags, gridtype, complete, extra_halo, name, tile_count) - MPP_TYPE_, intent(in) :: fieldx(:,:), fieldy(:,:) - type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: wbufferx(:,:), wbuffery(:,:) - MPP_TYPE_, intent(inout) :: ebufferx(:,:), ebuffery(:,:) - MPP_TYPE_, intent(inout) :: sbufferx(:,:), sbuffery(:,:) - MPP_TYPE_, intent(inout) :: nbufferx(:,:), nbuffery(:,:) - integer, intent(in) :: nest_level - integer, intent(in), optional :: flags - logical, intent(in), optional :: complete + MPP_TYPE_, intent(in) :: fieldx(:,:), fieldy(:,:) !< field x and y components on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + MPP_TYPE_, intent(inout) :: wbufferx(:,:), wbuffery(:,:) !< west side buffer x and y components + !! to be filled with data on coarse grid. + MPP_TYPE_, intent(inout) :: ebufferx(:,:), ebuffery(:,:) !< east side buffer x and y components + !! to be filled with data on coarse grid. + MPP_TYPE_, intent(inout) :: sbufferx(:,:), sbuffery(:,:) !< south side buffer x and y components + !! to be filled with data on coarse grid. + MPP_TYPE_, intent(inout) :: nbufferx(:,:), nbuffery(:,:) !< north side buffer x and y components + !! to be filled with data on coarse grid. + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + integer, intent(in), optional :: flags !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. 
+ logical, intent(in), optional :: complete !< When .true., do the buffer filling. + !! Default value is .true. integer, intent(in), optional :: gridtype - integer, intent(in), optional :: extra_halo - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + integer, intent(in), optional :: extra_halo !< extra halo for passing data + !! from coarse grid to fine grid. + !! Default is 0 and currently only support extra_halo = 0. + character(len=*), intent(in), optional :: name !< Name of the nest domain. + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. MPP_TYPE_ :: field3Dx(size(fieldx,1),size(fieldx,2),1) MPP_TYPE_ :: field3Dy(size(fieldy,1),size(fieldy,2),1) @@ -281,33 +324,43 @@ end subroutine MPP_UPDATE_NEST_FINE_2D_V_ subroutine MPP_UPDATE_NEST_FINE_3D_V_(fieldx, fieldy, nest_domain, wbufferx, wbuffery, sbufferx, sbuffery, & ebufferx, ebuffery, nbufferx, nbuffery, nest_level, & flags, gridtype, complete, extra_halo, name, tile_count) - MPP_TYPE_, intent(in) :: fieldx(:,:,:), fieldy(:,:,:) - type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: wbufferx(:,:,:), wbuffery(:,:,:) - MPP_TYPE_, intent(inout) :: ebufferx(:,:,:), ebuffery(:,:,:) - MPP_TYPE_, intent(inout) :: sbufferx(:,:,:), sbuffery(:,:,:) - MPP_TYPE_, intent(inout) :: nbufferx(:,:,:), nbuffery(:,:,:) - integer, intent(in) :: nest_level - integer, intent(in), optional :: flags - logical, intent(in), optional :: complete + MPP_TYPE_, intent(in) :: fieldx(:,:,:), fieldy(:,:,:) !< field x and y components on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + MPP_TYPE_, intent(inout) :: wbufferx(:,:,:), wbuffery(:,:,:) !< west side buffer x and y components + !! to be filled with data on coarse grid. 
+ MPP_TYPE_, intent(inout) :: ebufferx(:,:,:), ebuffery(:,:,:) !< east side buffer x and y components + !! to be filled with data on coarse grid. + MPP_TYPE_, intent(inout) :: sbufferx(:,:,:), sbuffery(:,:,:) !< south side buffer x and y components + !! to be filled with data on coarse grid. + MPP_TYPE_, intent(inout) :: nbufferx(:,:,:), nbuffery(:,:,:) !< north side buffer x and y components + !! to be filled with data on coarse grid. + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + integer, intent(in), optional :: flags !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. + logical, intent(in), optional :: complete !< When .true., do the buffer filling. + !! Default value is .true. integer, intent(in), optional :: gridtype - integer, intent(in), optional :: extra_halo - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + integer, intent(in), optional :: extra_halo !< extra halo for passing data + !! from coarse grid to fine grid. + !! Default is 0 and currently only support extra_halo = 0. + character(len=*), intent(in), optional :: name !< Name of the nest domain. + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. 
MPP_TYPE_ :: d_type type(nestSpec), pointer :: updatex=>NULL() type(nestSpec), pointer :: updatey=>NULL() - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: f_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: f_addrsy=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: wb_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: eb_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: sb_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: nb_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: wb_addrsy=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: eb_addrsy=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: sb_addrsy=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: nb_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: f_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: f_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: wb_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: eb_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: sb_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: nb_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: wb_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: eb_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: sb_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: nb_addrsy=-9999 character(len=3) :: text logical :: is_complete, set_mismatch @@ -491,19 +544,29 @@ end subroutine MPP_UPDATE_NEST_FINE_3D_V_ subroutine MPP_UPDATE_NEST_FINE_4D_V_(fieldx, fieldy, nest_domain, wbufferx, wbuffery, sbufferx, sbuffery, & ebufferx, ebuffery, nbufferx, nbuffery, nest_level, & flags, gridtype, complete, extra_halo, name, tile_count) - MPP_TYPE_, intent(in) :: fieldx(:,:,:,:), fieldy(:,:,:,:) + MPP_TYPE_, intent(in) :: 
fieldx(:,:,:,:), fieldy(:,:,:,:) !< field x and y + !! components on the model grid type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: wbufferx(:,:,:,:), wbuffery(:,:,:,:) - MPP_TYPE_, intent(inout) :: ebufferx(:,:,:,:), ebuffery(:,:,:,:) - MPP_TYPE_, intent(inout) :: sbufferx(:,:,:,:), sbuffery(:,:,:,:) - MPP_TYPE_, intent(inout) :: nbufferx(:,:,:,:), nbuffery(:,:,:,:) - integer, intent(in) :: nest_level - integer, intent(in), optional :: flags - logical, intent(in), optional :: complete + MPP_TYPE_, intent(inout) :: wbufferx(:,:,:,:), wbuffery(:,:,:,:) !< west side buffer x and y components + !! to be filled with data on coarse grid. + MPP_TYPE_, intent(inout) :: ebufferx(:,:,:,:), ebuffery(:,:,:,:) !< east side buffer x and y components + !! to be filled with data on coarse grid. + MPP_TYPE_, intent(inout) :: sbufferx(:,:,:,:), sbuffery(:,:,:,:) !< south side buffer x and y components + !! to be filled with data on coarse grid. + MPP_TYPE_, intent(inout) :: nbufferx(:,:,:,:), nbuffery(:,:,:,:) !< north side buffer x and y components + !! to be filled with data on coarse grid. + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + integer, intent(in), optional :: flags !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. + logical, intent(in), optional :: complete !< When .true., do the buffer filling. + !! Default value is .true. integer, intent(in), optional :: gridtype - integer, intent(in), optional :: extra_halo - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + integer, intent(in), optional :: extra_halo !< extra halo for passing data + !! from coarse grid to fine grid. + !! Default is 0 and currently only support extra_halo = 0. + character(len=*), intent(in), optional :: name !< Name of the nest domain. + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! 
default is 1 and currently only support tile_count = 1. MPP_TYPE_ :: field3Dx(size(fieldx,1),size(fieldx,2),size(fieldx,3)*size(fieldx,4)) MPP_TYPE_ :: field3Dy(size(fieldy,1),size(fieldy,2),size(fieldy,3)*size(fieldy,4)) @@ -545,14 +608,18 @@ end subroutine MPP_UPDATE_NEST_FINE_4D_V_ #endif VECTOR_FIELD_ subroutine MPP_UPDATE_NEST_COARSE_2D_(field_in, nest_domain, field_out, nest_level, complete, position, name, tile_count) - MPP_TYPE_, intent(in) :: field_in(:,:) - type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: field_out(:,:) - integer, intent(in) :: nest_level - logical, intent(in), optional :: complete - integer, intent(in), optional :: position - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: field_in(:,:) !< field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + MPP_TYPE_, intent(inout) :: field_out(:,:) !< field_out to be filled with data on coarse grid + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + logical, intent(in), optional :: complete !< When .true., do the buffer filling. + !! Default value is .true. + integer, intent(in), optional :: position !< Cell position. Its value should be CENTER, EAST, CORNER + !! or NORTH. Default is CENTER. + character(len=*), intent(in), optional :: name !< Name of the nest domain optional argument + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. MPP_TYPE_ :: field3D_in(size(field_in,1),size(field_in,2),1) MPP_TYPE_ :: field3D_out(size(field_out,1),size(field_out,2),1) @@ -571,19 +638,22 @@ end subroutine MPP_UPDATE_NEST_COARSE_2D_ !--- field_in is the data on fine grid pelist to be passed to coarse grid pelist. !--- field_in and field_out are all on the coarse grid. 
field_in is remapped from fine grid to coarse grid. subroutine MPP_UPDATE_NEST_COARSE_3D_(field_in, nest_domain, field_out, nest_level, complete, position, name, tile_count) - MPP_TYPE_, intent(in) :: field_in(:,:,:) - type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: field_out(:,:,:) - integer, intent(in) :: nest_level - logical, intent(in), optional :: complete - integer, intent(in), optional :: position - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: field_in(:,:,:) !< field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + MPP_TYPE_, intent(inout) :: field_out(:,:,:) !< field_out to be filled with data on coarse grid + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + logical, intent(in), optional :: complete !< When .true., do the buffer filling. Default value is .true. + integer, intent(in), optional :: position !< Cell position. Its value should be CENTER, EAST, CORNER, + !! or NORTH. Default is CENTER. + character(len=*), intent(in), optional :: name !< Name of the nest domain optional argument + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. 
MPP_TYPE_ :: d_type type(nestSpec), pointer :: update=>NULL() - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: fin_addrs=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: fout_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: fin_addrs=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: fout_addrs=-9999 character(len=3) :: text logical :: is_complete, set_mismatch integer :: tile @@ -676,14 +746,17 @@ end subroutine MPP_UPDATE_NEST_COARSE_3D_ !############################################################################### subroutine MPP_UPDATE_NEST_COARSE_4D_(field_in, nest_domain, field_out, nest_level, complete, position, name, tile_count) - MPP_TYPE_, intent(in) :: field_in(:,:,:,:) - type(nest_domain_type), intent(inout) :: nest_domain - MPP_TYPE_, intent(inout) :: field_out(:,:,:,:) - integer, intent(in) :: nest_level - logical, intent(in), optional :: complete - integer, intent(in), optional :: position - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: field_in(:,:,:,:) !< field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + MPP_TYPE_, intent(inout) :: field_out(:,:,:,:) !< field_out to be filled with data on coarse grid + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + logical, intent(in), optional :: complete !< When .true., do the buffer filling. Default value is .true. + integer, intent(in), optional :: position !< Cell position. Its value should be CENTER, EAST, CORNER, + !! or NORTH. Default is CENTER. + character(len=*), intent(in), optional :: name !< Name of the nest domain optional argument + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. 
MPP_TYPE_ :: field3D_in(size(field_in,1),size(field_in,2),size(field_in,3)*size(field_in,4)) MPP_TYPE_ :: field3D_out(size(field_out,1),size(field_out,2),size(field_out,3)*size(field_out,4)) @@ -704,16 +777,21 @@ end subroutine MPP_UPDATE_NEST_COARSE_4D_ !--- field_in and field_out are all on the coarse grid. field_in is remapped from fine grid to coarse grid. subroutine MPP_UPDATE_NEST_COARSE_2D_V_(fieldx_in, fieldy_in, nest_domain, fieldx_out, fieldy_out, nest_level, & flags, gridtype, complete, name, tile_count) - MPP_TYPE_, intent(in) :: fieldx_in(:,:) - MPP_TYPE_, intent(in) :: fieldy_in(:,:) - type(nest_domain_type), intent(inout) :: nest_domain - integer, intent(in), optional :: flags, gridtype - MPP_TYPE_, intent(inout) :: fieldx_out(:,:) - MPP_TYPE_, intent(inout) :: fieldy_out(:,:) - integer, intent(in) :: nest_level - logical, intent(in), optional :: complete - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: fieldx_in(:,:) !< x component of field on the model grid + MPP_TYPE_, intent(in) :: fieldy_in(:,:) !< y component of field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + integer, intent(in), optional :: flags, gridtype !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. + MPP_TYPE_, intent(inout) :: fieldx_out(:,:) !< x component of field_out to be + !! filled with data on coarse grid + MPP_TYPE_, intent(inout) :: fieldy_out(:,:) !< y component of field_out to be + !! filled with data on coarse grid + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + logical, intent(in), optional :: complete !< When .true., do the buffer filling. 
+ character(len=*), intent(in), optional :: name !< Name of the nest domain optional argument + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. MPP_TYPE_ :: field3Dx_in(size(fieldx_in,1),size(fieldx_in,2),1) MPP_TYPE_ :: field3Dy_in(size(fieldy_in,1),size(fieldy_in,2),1) @@ -739,24 +817,29 @@ end subroutine MPP_UPDATE_NEST_COARSE_2D_V_ !--- field_in and field_out are all on the coarse grid. field_in is remapped from fine grid to coarse grid. subroutine MPP_UPDATE_NEST_COARSE_3D_V_(fieldx_in, fieldy_in, nest_domain, fieldx_out, fieldy_out, nest_level, & flags, gridtype, complete, name, tile_count) - MPP_TYPE_, intent(in) :: fieldx_in(:,:,:) - MPP_TYPE_, intent(in) :: fieldy_in(:,:,:) - type(nest_domain_type), intent(inout) :: nest_domain - integer, intent(in), optional :: flags, gridtype - MPP_TYPE_, intent(inout) :: fieldx_out(:,:,:) - MPP_TYPE_, intent(inout) :: fieldy_out(:,:,:) - integer, intent(in) :: nest_level - logical, intent(in), optional :: complete - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: fieldx_in(:,:,:) !< x component field on the model grid + MPP_TYPE_, intent(in) :: fieldy_in(:,:,:) !< y component of field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + integer, intent(in), optional :: flags, gridtype !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. + MPP_TYPE_, intent(inout) :: fieldx_out(:,:,:) !< x component of field_out to be + !! filled with data on coarse grid + MPP_TYPE_, intent(inout) :: fieldy_out(:,:,:) !< y component of field_out to be + !! 
filled with data on coarse grid + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + logical, intent(in), optional :: complete !< When .true., do the buffer filling. + character(len=*), intent(in), optional :: name !< Name of the nest domain optional argument + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. MPP_TYPE_ :: d_type type(nestSpec), pointer :: updatex=>NULL() type(nestSpec), pointer :: updatey=>NULL() - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: fin_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: fin_addrsy=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: fout_addrsx=-9999 - integer(LONG_KIND),dimension(MAX_DOMAIN_FIELDS),save :: fout_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: fin_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: fin_addrsy=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: fout_addrsx=-9999 + integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: fout_addrsy=-9999 character(len=3) :: text logical :: is_complete, set_mismatch integer :: tile @@ -917,16 +1000,21 @@ end subroutine MPP_UPDATE_NEST_COARSE_3D_V_ subroutine MPP_UPDATE_NEST_COARSE_4D_V_(fieldx_in, fieldy_in, nest_domain, fieldx_out, fieldy_out, nest_level, & flags, gridtype, complete, name, tile_count) - MPP_TYPE_, intent(in) :: fieldx_in(:,:,:,:) - MPP_TYPE_, intent(in) :: fieldy_in(:,:,:,:) - type(nest_domain_type), intent(inout) :: nest_domain - integer, intent(in), optional :: flags, gridtype - MPP_TYPE_, intent(inout) :: fieldx_out(:,:,:,:) - MPP_TYPE_, intent(inout) :: fieldy_out(:,:,:,:) - integer, intent(in) :: nest_level - logical, intent(in), optional :: complete - character(len=*), intent(in), optional :: name - integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in) :: fieldx_in(:,:,:,:) !< x component field on the 
model grid + MPP_TYPE_, intent(in) :: fieldy_in(:,:,:,:) !< y component field on the model grid + type(nest_domain_type), intent(inout) :: nest_domain !< Holds the information to pass data + !! between fine and coarse grid. + integer, intent(in), optional :: flags, gridtype !< Specify the direction of fine grid halo buffer to be filled. + !! Default value is XUPDATE+YUPDATE. + MPP_TYPE_, intent(inout) :: fieldx_out(:,:,:,:) !< x component of field_out to be + !! filled with data on coarse grid + MPP_TYPE_, intent(inout) :: fieldy_out(:,:,:,:) !< y component of field_out to be + !! filled with data on coarse grid + integer, intent(in) :: nest_level !< level of the nest (> 1 implies a telescoping nest) + logical, intent(in), optional :: complete !< When .true., do the buffer filling. + character(len=*), intent(in), optional :: name !< Name of the nest domain optional argument + integer, intent(in), optional :: tile_count !< Used to support multiple-tile-per-pe. + !! default is 1 and currently only support tile_count = 1. MPP_TYPE_ :: field3Dx_in(size(fieldx_in,1),size(fieldx_in,2),size(fieldx_in,3)*size(fieldx_in,4)) MPP_TYPE_ :: field3Dy_in(size(fieldy_in,1),size(fieldy_in,2),size(fieldy_in,3)*size(fieldy_in,4)) diff --git a/mpp/include/mpp_util.inc b/mpp/include/mpp_util.inc index bd58fd6527..4d5a610264 100644 --- a/mpp/include/mpp_util.inc +++ b/mpp/include/mpp_util.inc @@ -853,7 +853,7 @@ end function rarray_to_char !##################################################################### subroutine mpp_clock_end(id) integer, intent(in) :: id - integer(LONG_KIND) :: delta + integer(i8_kind) :: delta integer :: errunit if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_CLOCK_END: You must first call mpp_init.' ) @@ -905,7 +905,7 @@ end function rarray_to_char integer, intent(in) :: event_id integer, intent(in), optional :: bytes integer :: n - integer(LONG_KIND) :: delta + integer(i8_kind) :: delta integer :: errunit if( .not. 
mpp_record_timing_data )return diff --git a/mpp/include/mpp_write_2Ddecomp.h b/mpp/include/mpp_write_2Ddecomp.h index f45dddc9af..035b008cea 100644 --- a/mpp/include/mpp_write_2Ddecomp.h +++ b/mpp/include/mpp_write_2Ddecomp.h @@ -40,14 +40,14 @@ type(domain2D), intent(in), optional :: domain integer, intent(in), optional :: tile_count integer, dimension(size(field%axes(:))) :: start, axsiz - real(DOUBLE_KIND) :: time + real(r8_kind) :: time integer :: time_level logical :: newtime integer :: subdomain(4) integer :: packed_data(nwords) integer :: i, is, ie, js, je - real(FLOAT_KIND) :: data_r4(nwords) + real(r4_kind) :: data_r4(nwords) pointer( ptr1, data_r4) pointer( ptr2, packed_data) @@ -129,10 +129,10 @@ #ifdef use_netCDF !write time information if new time if( newtime )then - if( KIND(time).EQ.DOUBLE_KIND )then + if( KIND(time).EQ.r8_kind )then error = NF_PUT_VAR1_DOUBLE( mpp_file(unit)%ncid, mpp_file(unit)%id, mpp_file(unit:unit)%time_level, time ) - else if( KIND(time).EQ.FLOAT_KIND )then - error = NF90_PUT_VAR ( mpp_file(unit)%ncid, mpp_file(unit)%id, time) + else if( KIND(time).EQ.r4_kind )then + error = NF90_PUT_VAR ( mpp_file(unit)%ncid, mpp_file(unit)%id, time) end if end if if( field%pack == 0 )then @@ -160,9 +160,27 @@ write( unit,* )field%id, subdomain, time_level, time, data else !MPP_IEEE32 or MPP_NATIVE if( mpp_file(unit)%access.EQ.MPP_SEQUENTIAL )then +#ifdef __sgi + if( mpp_file(unit)%format.EQ.MPP_IEEE32 )then + data_r4 = data !IEEE conversion layer on SGI until assign -N ieee_32 is supported + write(unit)field%id, subdomain, time_level, time, data_r4 + else + write(unit)field%id, subdomain, time_level, time, data + end if +#else write(unit)field%id, subdomain, time_level, time, data +#endif else !MPP_DIRECT +#ifdef __sgi + if( mpp_file(unit)%format.EQ.MPP_IEEE32 )then + data_r4 = data !IEEE conversion layer on SGI until assign -N ieee_32 is supported + write( unit, rec=mpp_file(unit)%record )field%id, subdomain, time_level, time, data_r4 + else 
+ write( unit, rec=mpp_file(unit)%record )field%id, subdomain, time_level, time, data + end if +#else write( unit, rec=mpp_file(unit)%record )field%id, subdomain, time_level, time, data +#endif if( debug )print '(a,i6,a,i6)', 'MPP_WRITE: PE=', pe, ' wrote record ', mpp_file(unit)%record end if end if @@ -427,3 +445,4 @@ return end subroutine MPP_WRITE_2DDECOMP_4D_ + diff --git a/mpp/include/system_clock.h b/mpp/include/system_clock.h index 1759c15f38..88375dbeae 100644 --- a/mpp/include/system_clock.h +++ b/mpp/include/system_clock.h @@ -24,9 +24,9 @@ subroutine system_clock_mpi( count, count_rate, count_max ) ! There can be one ONE baseline count0 and this routine is ! included in multiple places. !mimics F90 SYSTEM_CLOCK intrinsic - integer(LONG_KIND), intent(out), optional :: count, count_rate, count_max + integer(i8_kind), intent(out), optional :: count, count_rate, count_max !count must return a number between 0 and count_max - integer(LONG_KIND), parameter :: maxtick=HUGE(count_max) + integer(i8_kind), parameter :: maxtick=HUGE(count_max) if(first_call_system_clock_mpi)then first_call_system_clock_mpi=.false. 
mpi_count0 = MPI_WTime() @@ -48,7 +48,7 @@ subroutine system_clock_mpi( count, count_rate, count_max ) #define SYSTEM_CLOCK system_clock_default subroutine system_clock_default( count, count_rate, count_max ) !mimics F90 SYSTEM_CLOCK intrinsic - integer(LONG_KIND), optional :: count, count_rate, count_max + integer(i8_kind), optional :: count, count_rate, count_max !count must return a number between 0 and count_max integer :: count_int, count_rate_int, count_max_int call system_clock( count_int, count_rate_int, count_max_int) diff --git a/mpp/mpp.F90 b/mpp/mpp.F90 index 5b0a3edb37..60fe1a0c2c 100644 --- a/mpp/mpp.F90 +++ b/mpp/mpp.F90 @@ -159,7 +159,6 @@ module mpp_mod #define rank(X) size(shape(X)) #endif -#include #if defined(use_libMPI) use mpi @@ -186,6 +185,7 @@ module mpp_mod use mpp_data_mod, only : stat, mpp_stack, ptr_stack, status, ptr_status, sync, ptr_sync use mpp_data_mod, only : mpp_from_pe, ptr_from, remote_data_loc, ptr_remote use mpp_data_mod, only : mpp_data_version=>version + use platform_mod implicit none private @@ -243,7 +243,7 @@ module mpp_mod type :: event private character(len=16) :: name - integer(LONG_KIND), dimension(MAX_EVENTS) :: ticks, bytes + integer(i8_kind), dimension(MAX_EVENTS) :: ticks, bytes integer :: calls end type event @@ -251,8 +251,8 @@ module mpp_mod type :: clock private character(len=32) :: name - integer(LONG_KIND) :: tick - integer(LONG_KIND) :: total_ticks + integer(i8_kind) :: tick + integer(i8_kind) :: total_ticks integer :: peset_num logical :: sync_on_begin, detailed integer :: grain @@ -264,12 +264,12 @@ module mpp_mod type :: Clock_Data_Summary private character(len=16) :: name - real(DOUBLE_KIND) :: msg_size_sums(MAX_BINS) - real(DOUBLE_KIND) :: msg_time_sums(MAX_BINS) - real(DOUBLE_KIND) :: total_data - real(DOUBLE_KIND) :: total_time - integer(LONG_KIND) :: msg_size_cnts(MAX_BINS) - integer(LONG_KIND) :: total_cnts + real(r8_kind) :: msg_size_sums(MAX_BINS) + real(r8_kind) :: msg_time_sums(MAX_BINS) + 
real(r8_kind) :: total_data + real(r8_kind) :: total_time + integer(i8_kind) :: msg_size_cnts(MAX_BINS) + integer(i8_kind) :: total_cnts end type Clock_Data_Summary type :: Summary_Struct @@ -540,14 +540,10 @@ module mpp_mod interface mpp_max module procedure mpp_max_real8_0d module procedure mpp_max_real8_1d -#ifndef no_8byte_integers module procedure mpp_max_int8_0d module procedure mpp_max_int8_1d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_max_real4_0d module procedure mpp_max_real4_1d -#endif module procedure mpp_max_int4_0d module procedure mpp_max_int4_1d end interface @@ -555,14 +551,10 @@ module mpp_mod interface mpp_min module procedure mpp_min_real8_0d module procedure mpp_min_real8_1d -#ifndef no_8byte_integers module procedure mpp_min_int8_0d module procedure mpp_min_int8_1d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_min_real4_0d module procedure mpp_min_real4_1d -#endif module procedure mpp_min_int4_0d module procedure mpp_min_int4_1d end interface @@ -602,14 +594,12 @@ module mpp_mod ! 
interface mpp_sum -#ifndef no_8byte_integers module procedure mpp_sum_int8 module procedure mpp_sum_int8_scalar module procedure mpp_sum_int8_2d module procedure mpp_sum_int8_3d module procedure mpp_sum_int8_4d module procedure mpp_sum_int8_5d -#endif module procedure mpp_sum_real8 module procedure mpp_sum_real8_scalar module procedure mpp_sum_real8_2d @@ -630,14 +620,12 @@ module mpp_mod module procedure mpp_sum_int4_3d module procedure mpp_sum_int4_4d module procedure mpp_sum_int4_5d -#ifdef OVERLOAD_R4 module procedure mpp_sum_real4 module procedure mpp_sum_real4_scalar module procedure mpp_sum_real4_2d module procedure mpp_sum_real4_3d module procedure mpp_sum_real4_4d module procedure mpp_sum_real4_5d -#endif #ifdef OVERLOAD_C4 module procedure mpp_sum_cmplx4 module procedure mpp_sum_cmplx4_scalar @@ -649,14 +637,12 @@ module mpp_mod end interface interface mpp_sum_ad -#ifndef no_8byte_integers module procedure mpp_sum_int8_ad module procedure mpp_sum_int8_scalar_ad module procedure mpp_sum_int8_2d_ad module procedure mpp_sum_int8_3d_ad module procedure mpp_sum_int8_4d_ad module procedure mpp_sum_int8_5d_ad -#endif module procedure mpp_sum_real8_ad module procedure mpp_sum_real8_scalar_ad module procedure mpp_sum_real8_2d_ad @@ -677,14 +663,12 @@ module mpp_mod module procedure mpp_sum_int4_3d_ad module procedure mpp_sum_int4_4d_ad module procedure mpp_sum_int4_5d_ad -#ifdef OVERLOAD_R4 module procedure mpp_sum_real4_ad module procedure mpp_sum_real4_scalar_ad module procedure mpp_sum_real4_2d_ad module procedure mpp_sum_real4_3d_ad module procedure mpp_sum_real4_4d_ad module procedure mpp_sum_real4_5d_ad -#endif #ifdef OVERLOAD_C4 module procedure mpp_sum_cmplx4_ad module procedure mpp_sum_cmplx4_scalar_ad @@ -859,7 +843,6 @@ module mpp_mod module procedure mpp_transmit_cmplx8_4d module procedure mpp_transmit_cmplx8_5d #endif -#ifndef no_8byte_integers module procedure mpp_transmit_int8 module procedure mpp_transmit_int8_scalar module procedure 
mpp_transmit_int8_2d @@ -872,7 +855,6 @@ module mpp_mod module procedure mpp_transmit_logical8_3d module procedure mpp_transmit_logical8_4d module procedure mpp_transmit_logical8_5d -#endif module procedure mpp_transmit_real4 module procedure mpp_transmit_real4_scalar @@ -917,7 +899,6 @@ module mpp_mod module procedure mpp_recv_cmplx8_4d module procedure mpp_recv_cmplx8_5d #endif -#ifndef no_8byte_integers module procedure mpp_recv_int8 module procedure mpp_recv_int8_scalar module procedure mpp_recv_int8_2d @@ -930,7 +911,6 @@ module mpp_mod module procedure mpp_recv_logical8_3d module procedure mpp_recv_logical8_4d module procedure mpp_recv_logical8_5d -#endif module procedure mpp_recv_real4 module procedure mpp_recv_real4_scalar @@ -975,7 +955,6 @@ module mpp_mod module procedure mpp_send_cmplx8_4d module procedure mpp_send_cmplx8_5d #endif -#ifndef no_8byte_integers module procedure mpp_send_int8 module procedure mpp_send_int8_scalar module procedure mpp_send_int8_2d @@ -988,7 +967,6 @@ module mpp_mod module procedure mpp_send_logical8_3d module procedure mpp_send_logical8_4d module procedure mpp_send_logical8_5d -#endif module procedure mpp_send_real4 module procedure mpp_send_real4_scalar @@ -1067,7 +1045,6 @@ module mpp_mod module procedure mpp_broadcast_cmplx8_4d module procedure mpp_broadcast_cmplx8_5d #endif -#ifndef no_8byte_integers module procedure mpp_broadcast_int8 module procedure mpp_broadcast_int8_scalar module procedure mpp_broadcast_int8_2d @@ -1080,7 +1057,6 @@ module mpp_mod module procedure mpp_broadcast_logical8_3d module procedure mpp_broadcast_logical8_4d module procedure mpp_broadcast_logical8_5d -#endif module procedure mpp_broadcast_real4 module procedure mpp_broadcast_real4_scalar @@ -1118,13 +1094,13 @@ module mpp_mod ! Parallel checksums. ! ! - ! mpp_chksum is a parallel checksum routine that returns an + ! \empp_chksum is a parallel checksum routine that returns an ! 
identical answer for the same array irrespective of how it has been - ! partitioned across processors. LONG_KINDis the KIND + ! partitioned across processors. \eint_kind is the KIND ! parameter corresponding to long integers (see discussion on ! OS-dependent preprocessor directives) defined in - ! the header file fms_platform.h. MPP_TYPE_ corresponds to any - ! 4-byte and 8-byte variant of integer, real, complex, logical + ! the file platform.F90. \eMPP_TYPE_ corresponds to any + ! 4-byte and 8-byte variant of \einteger, \ereal, \ecomplex, \elogical ! variables, of rank 0 to 5. ! ! Integer checksums on FP data use the F90 TRANSFER() @@ -1160,7 +1136,6 @@ module mpp_mod ! ! interface mpp_chksum -#ifndef no_8byte_integers module procedure mpp_chksum_i8_1d module procedure mpp_chksum_i8_2d module procedure mpp_chksum_i8_3d @@ -1172,7 +1147,6 @@ module mpp_mod module procedure mpp_chksum_i8_4d_rmask module procedure mpp_chksum_i8_5d_rmask -#endif module procedure mpp_chksum_i4_1d module procedure mpp_chksum_i4_2d module procedure mpp_chksum_i4_3d @@ -1197,14 +1171,12 @@ module mpp_mod module procedure mpp_chksum_c8_4d module procedure mpp_chksum_c8_5d #endif -#ifdef OVERLOAD_R4 module procedure mpp_chksum_r4_0d module procedure mpp_chksum_r4_1d module procedure mpp_chksum_r4_2d module procedure mpp_chksum_r4_3d module procedure mpp_chksum_r4_4d module procedure mpp_chksum_r4_5d -#endif #ifdef OVERLOAD_C4 module procedure mpp_chksum_c4_0d module procedure mpp_chksum_c4_1d @@ -1226,11 +1198,11 @@ module mpp_mod logical :: module_is_initialized = .false. logical :: debug = .false. integer :: npes=1, root_pe=0, pe=0 - integer(LONG_KIND) :: tick, ticks_per_sec, max_ticks, start_tick, end_tick, tick0=0 + integer(i8_kind) :: tick, ticks_per_sec, max_ticks, start_tick, end_tick, tick0=0 integer :: mpp_comm_private logical :: first_call_system_clock_mpi=.TRUE. - real(DOUBLE_KIND) :: mpi_count0=0 ! use to prevent integer overflow - real(DOUBLE_KIND) :: mpi_tick_rate=0.d0 ! 
clock rate for mpi_wtick() + real(r8_kind) :: mpi_count0=0 ! use to prevent integer overflow + real(r8_kind) :: mpi_tick_rate=0.d0 ! clock rate for mpi_wtick() logical :: mpp_record_timing_data=.TRUE. type(clock),save :: clocks(MAX_CLOCKS) integer :: log_unit, etc_unit diff --git a/mpp/mpp_data.F90 b/mpp/mpp_data.F90 index a14211c1be..2cc4d26c70 100644 --- a/mpp/mpp_data.F90 +++ b/mpp/mpp_data.F90 @@ -17,13 +17,13 @@ !* License along with FMS. If not, see . !*********************************************************************** module mpp_data_mod -#include #if defined(use_libMPI) use mpi #endif use mpp_parameter_mod, only : MAXPES + use platform_mod implicit none private diff --git a/mpp/mpp_domains.F90 b/mpp/mpp_domains.F90 index dab642a1dd..dea0787220 100644 --- a/mpp/mpp_domains.F90 +++ b/mpp/mpp_domains.F90 @@ -17,105 +17,75 @@ !* License along with FMS. If not, see . !*********************************************************************** !----------------------------------------------------------------------- -! Domain decomposition and domain update for message-passing codes -! -! AUTHOR: V. Balaji (vb@gfdl.gov) -! SGI/GFDL Princeton University +!> @brief Domain decomposition and domain update for message-passing codes +!> @author V. Balaji SGI/GFDL Princeton University ! !----------------------------------------------------------------------- - -! -! V. Balaji -! -! -! Zhi Liang -! ! ! - -! -! mpp_domains_mod is a set of simple calls for domain -! decomposition and domain updates on rectilinear grids. It requires the -! module mpp_mod, upon which it is built. -! - -! -! Scalable implementations of finite-difference codes are generally -! based on decomposing the model domain into subdomains that are -! distributed among processors. These domains will then be obliged to -! exchange data at their boundaries if data dependencies are merely -! nearest-neighbour, or may need to acquire information from the global -! 
domain if there are extended data dependencies, as in the spectral -! transform. The domain decomposition is a key operation in the -! development of parallel codes. -! -! mpp_domains_mod provides a domain decomposition and domain -! update API for rectilinear grids, built on top of the mpp_mod API for message passing. Features -! of mpp_domains_mod include: +!> @detailed mpp_domains_mod is a set of simple calls for domain +!! decomposition and domain updates on rectilinear grids. It requires the +!! module mpp.F90, upon which it is built. +!! Scalable implementations of finite-difference codes are generally +!! based on decomposing the model domain into subdomains that are +!! distributed among processors. These domains will then be obliged to +!! exchange data at their boundaries if data dependencies are merely +!! nearest-neighbour, or may need to acquire information from the global +!! domain if there are extended data dependencies, as in the spectral +!! transform. The domain decomposition is a key operation in the +!! development of parallel codes.\n +!!\n +!! mpp_domains_mod provides a domain decomposition and domain +!! update API for rectilinear grids, built on top of the mpp_mod API for message passing. +!! Features of mpp_domains_mod include:\n +!!\n +!! Simple, minimal API, with free access to underlying API for more complicated stuff.\n +!!\n +!! Design toward typical use in climate/weather CFD codes.\n +!! +!> @par[Domains] +!! It is assumed that domain decomposition will mainly be in 2 +!! horizontal dimensions, which will in general be the two +!! fastest-varying indices. There is a separate implementation of 1D +!! decomposition on the fastest-varying index, and 1D decomposition on +!! the second index, treated as a special case of 2D decomposition, is +!! also possible. We define domain as the grid associated with a task. +!! We define the compute domain as the set of gridpoints that are +!! 
computed by a task, and the data domain as the set of points +!! that are required by the task for the calculation. There can in +!! general be more than 1 task per PE, though often +!! the number of domains is the same as the processor count. We define +!! the global domain as the global computational domain of the +!! entire model (i.e, the same as the computational domain if run on a +!! single processor). 2D domains are defined using a derived type domain2D, +!! constructed as follows (see comments in code for more details). ! -! Simple, minimal API, with free access to underlying API for more complicated stuff. +!> @example type, public :: domain_axis_spec\n +!! private\n +!! integer :: begin, end, size, max_size\n +!! logical :: is_global\n +!! end type domain_axis_spec\n +!> @example type, public :: domain1D\n +!! private\n +!! type(domain_axis_spec) :: compute, data, global, active\n +!! logical :: mustputb, mustgetb, mustputf, mustgetf, folded\n +!! type(domain1D), pointer, dimension(:) :: list\n +!! integer :: pe ! pe to which the domain is assigned\n +!! integer :: pos\n +!! end type domain1D ! -! Design toward typical use in climate/weather CFD codes. -! -!

Domains

-! -! I have assumed that domain decomposition will mainly be in 2 -! horizontal dimensions, which will in general be the two -! fastest-varying indices. There is a separate implementation of 1D -! decomposition on the fastest-varying index, and 1D decomposition on -! the second index, treated as a special case of 2D decomposition, is -! also possible. We define domain as the grid associated with a task. -! We define the compute domain as the set of gridpoints that are -! computed by a task, and the data domain as the set of points -! that are required by the task for the calculation. There can in -! general be more than 1 task per PE, though often -! the number of domains is the same as the processor count. We define -! the global domain as the global computational domain of the -! entire model (i.e, the same as the computational domain if run on a -! single processor). 2D domains are defined using a derived type domain2D, -! constructed as follows (see comments in code for more details): -! -!
-!     type, public :: domain_axis_spec
-!        private
-!        integer :: begin, end, size, max_size
-!        logical :: is_global
-!     end type domain_axis_spec
-!     type, public :: domain1D
-!        private
-!        type(domain_axis_spec) :: compute, data, global, active
-!        logical :: mustputb, mustgetb, mustputf, mustgetf, folded
-!        type(domain1D), pointer, dimension(:) :: list
-!        integer :: pe              !PE to which this domain is assigned
-!        integer :: pos
-!     end type domain1D
-!domaintypes of higher rank can be constructed from type domain1D
-!typically we only need 1 and 2D, but could need higher (e.g 3D LES)
-!some elements are repeated below if they are needed once per domain
-!     type, public :: domain2D
-!        private
-!        type(domain1D) :: x
-!        type(domain1D) :: y
-!        type(domain2D), pointer, dimension(:) :: list
-!        integer :: pe              !PE to which this domain is assigned
-!        integer :: pos
-!     end type domain2D
-!     type(domain1D), public :: NULL_DOMAIN1D
-!     type(domain2D), public :: NULL_DOMAIN2D
-!   
- -! The domain2D type contains all the necessary information to -! define the global, compute and data domains of each task, as well as the PE -! associated with the task. The PEs from which remote data may be -! acquired to update the data domain are also contained in a linked list -! of neighbours. -!
+!> @example type, public :: domain2D\n +!! private\n +!! type(domain1D) :: x\n +!! type(domain1D) :: y\n +!! type(domain2D), pointer, dimension(:) :: list\n +!! integer :: pe ! PE to which this domain is assigned\n +!! integer :: pos\n +!! end type domain2D\n +!! type(domain1D), public :: NULL_DOMAIN1D\n +!! type(domain2D), public :: NULL_DOMAIN2D\n module mpp_domains_mod -!a generalized domain decomposition package for use with mpp_mod -!Balaji (vb@gfdl.gov) 15 March 1999 -#include #if defined(use_libMPI) use mpi @@ -150,6 +120,7 @@ module mpp_domains_mod use mpp_mod, only : mpp_declare_pelist, mpp_set_current_pelist use mpp_memutils_mod, only : mpp_memuse_begin, mpp_memuse_end use mpp_efp_mod, only : mpp_reproducing_sum + use platform_mod implicit none private @@ -243,7 +214,7 @@ module mpp_domains_mod integer, parameter :: FIELD_Y = 2 - !--- data types used mpp_domains_mod. + ! data types used by mpp_domains_mod type unstruct_axis_spec private integer :: begin, end, size, max_size @@ -289,13 +260,14 @@ module mpp_domains_mod integer :: tile_root_pe integer :: tile_npes integer :: npes_io_group - integer(INT_KIND) :: io_layout + integer(i4_kind) :: io_layout end type domainUG - type domain_axis_spec !type used to specify index limits along an axis of a domain +!> type used to specify index limits along an axis of a domain + type domain_axis_spec private - integer :: begin, end, size, max_size !start, end of domain axis, size, max size in set - logical :: is_global !TRUE if domain axis extent covers global domain + integer :: begin, end, size, max_size !< start, end of domain axis, size, max size in set + logical :: is_global !< .true. 
if domain axis extent covers global domain end type domain_axis_spec type domain1D @@ -303,9 +275,9 @@ module mpp_domains_mod type(domain_axis_spec) :: compute, data, global, memory logical :: cyclic type(domain1D), pointer :: list(:) =>NULL() - integer :: pe !PE to which this domain is assigned - integer :: pos !position of this PE within link list, i.e domain%list(pos)%pe = pe - integer :: goffset, loffset !needed for global sum + integer :: pe ! NULL() ! x-direction domain decomposition - type(domain1D_spec), pointer :: y(:) => NULL() ! x-direction domain decomposition - integer, pointer :: tile_id(:) => NULL() ! tile id of each tile - integer :: pe ! PE to which this domain is assigned - integer :: pos ! position of this PE within link list - integer :: tile_root_pe ! root pe of tile. + type(domain1D_spec), pointer :: x(:) => NULL() !< x-direction domain decomposition + type(domain1D_spec), pointer :: y(:) => NULL() !< x-direction domain decomposition + integer, pointer :: tile_id(:) => NULL() !< tile id of each tile + integer :: pe !< PE to which this domain is assigned + integer :: pos !< position of this PE within link list + integer :: tile_root_pe !< root pe of tile. end type domain2D_spec type overlap_type private - integer :: count = 0 ! number of ovrelapping + integer :: count = 0 !< number of ovrelapping integer :: pe - integer :: start_pos ! start position in the buffer - integer :: totsize ! all message size - integer , pointer :: msgsize(:) => NULL() ! overlapping msgsize to be sent or received - integer, pointer :: tileMe(:) => NULL() ! my tile id for this overlap - integer, pointer :: tileNbr(:) => NULL() ! neighbor tile id for this overlap - integer, pointer :: is(:) => NULL() ! starting i-index - integer, pointer :: ie(:) => NULL() ! ending i-index - integer, pointer :: js(:) => NULL() ! starting j-index - integer, pointer :: je(:) => NULL() ! ending j-index - integer, pointer :: dir(:) => NULL() ! 
direction ( value 1,2,3,4 = E,S,W,N) - integer, pointer :: rotation(:) => NULL() ! rotation angle. - integer, pointer :: index(:) => NULL() ! for refinement - logical, pointer :: from_contact(:) => NULL() ! indicate if the overlap is computed from define_contact_overlap + integer :: start_pos !< start position in the buffer + integer :: totsize !< all message size + integer , pointer :: msgsize(:) => NULL() !< overlapping msgsize to be sent or received + integer, pointer :: tileMe(:) => NULL() !< my tile id for this overlap + integer, pointer :: tileNbr(:) => NULL() !< neighbor tile id for this overlap + integer, pointer :: is(:) => NULL() !< starting i-index + integer, pointer :: ie(:) => NULL() !< ending i-index + integer, pointer :: js(:) => NULL() !< starting j-index + integer, pointer :: je(:) => NULL() !< ending j-index + integer, pointer :: dir(:) => NULL() !< direction ( value 1,2,3,4 = E,S,W,N) + integer, pointer :: rotation(:) => NULL() !< rotation angle. + integer, pointer :: index(:) => NULL() !< for refinement + logical, pointer :: from_contact(:) => NULL() !< indicate if the overlap is computed from define_contact_overlap end type overlap_type type overlapSpec private - integer :: whalo, ehalo, shalo, nhalo ! halo size + integer :: whalo, ehalo, shalo, nhalo !< halo size integer :: xbegin, xend, ybegin, yend integer :: nsend, nrecv integer :: sendsize, recvsize @@ -359,60 +331,63 @@ module mpp_domains_mod integer :: xbegin, xend, ybegin, yend end type tile_type -!domaintypes of higher rank can be constructed from type domain1D -!typically we only need 1 and 2D, but could need higher (e.g 3D LES) -!some elements are repeated below if they are needed once per domain, not once per axis - +!> @brief The domain2D type contains all the necessary information to +!! define the global, compute and data domains of each task, as well as the PE +!! associated with the task. The PEs from which remote data may be +!! 
acquired to update the data domain are also contained in a linked list of neighbours. +!> @detailed domain types of higher rank can be constructed from type domain1D +!! typically we only need 1 and 2D, but could need higher (e.g 3D LES) +!! some elements are repeated below if they are needed once per domain, not once per axis type domain2D private - character(len=NAME_LENGTH) :: name='unnamed' ! name of the domain, default is "unspecified" - integer(LONG_KIND) :: id - integer :: pe ! PE to which this domain is assigned + character(len=NAME_LENGTH) :: name='unnamed' !< name of the domain, default is "unspecified" + integer(i8_kind) :: id + integer :: pe !< PE to which this domain is assigned integer :: fold - integer :: pos ! position of this PE within link list - logical :: symmetry ! indicate the domain is symmetric or non-symmetric. - integer :: whalo, ehalo ! halo size in x-direction - integer :: shalo, nhalo ! halo size in y-direction - integer :: ntiles ! number of tiles within mosaic - integer :: max_ntile_pe ! maximum value in the pelist of number of tiles on each pe. - integer :: ncontacts ! number of contact region within mosaic. - logical :: rotated_ninety ! indicate if any contact rotate NINETY or MINUS_NINETY - logical :: initialized=.FALSE. ! indicate if the overlapping is computed or not. - integer :: tile_root_pe ! root pe of current tile. - integer :: io_layout(2) ! io_layout, will be set through mpp_define_io_domain - ! default = domain layout - integer, pointer :: pearray(:,:) => NULL() ! pe of each layout position - integer, pointer :: tile_id(:) => NULL() ! tile id of each tile on current processor - integer, pointer :: tile_id_all(:)=> NULL() ! tile id of all the tiles of domain - type(domain1D), pointer :: x(:) => NULL() ! x-direction domain decomposition - type(domain1D), pointer :: y(:) => NULL() ! y-direction domain decomposition - type(domain2D_spec),pointer :: list(:) => NULL() ! 
domain decomposition on pe list - type(tile_type), pointer :: tileList(:) => NULL() ! store tile information - type(overlapSpec), pointer :: check_C => NULL() ! send and recv information for boundary consistency check of C-cell - type(overlapSpec), pointer :: check_E => NULL() ! send and recv information for boundary consistency check of E-cell - type(overlapSpec), pointer :: check_N => NULL() ! send and recv information for boundary consistency check of N-cell - type(overlapSpec), pointer :: bound_C => NULL() ! send information for getting boundary value for symmetry domain. - type(overlapSpec), pointer :: bound_E => NULL() ! send information for getting boundary value for symmetry domain. - type(overlapSpec), pointer :: bound_N => NULL() ! send information for getting boundary value for symmetry domain. - type(overlapSpec), pointer :: update_T => NULL() ! send and recv information for halo update of T-cell. - type(overlapSpec), pointer :: update_E => NULL() ! send and recv information for halo update of E-cell. - type(overlapSpec), pointer :: update_C => NULL() ! send and recv information for halo update of C-cell. - type(overlapSpec), pointer :: update_N => NULL() ! send and recv information for halo update of N-cell. - type(domain2d), pointer :: io_domain => NULL() ! domain for IO, will be set through calling mpp_set_io_domain ( this will be changed). + integer :: pos !< position of this PE within link list + logical :: symmetry !< indicate the domain is symmetric or non-symmetric. + integer :: whalo, ehalo !< halo size in x-direction + integer :: shalo, nhalo !< halo size in y-direction + integer :: ntiles !< number of tiles within mosaic + integer :: max_ntile_pe !< maximum value in the pelist of number of tiles on each pe. + integer :: ncontacts !< number of contact region within mosaic. + logical :: rotated_ninety !< indicate if any contact rotate NINETY or MINUS_NINETY + logical :: initialized=.FALSE. !< indicate if the overlapping is computed or not. 
+ integer :: tile_root_pe !< root pe of current tile. + integer :: io_layout(2) !< io_layout, will be set through mpp_define_io_domain + !! default = domain layout + integer, pointer :: pearray(:,:) => NULL() !< pe of each layout position + integer, pointer :: tile_id(:) => NULL() !< tile id of each tile on current processor + integer, pointer :: tile_id_all(:)=> NULL() !< tile id of all the tiles of domain + type(domain1D), pointer :: x(:) => NULL() !< x-direction domain decomposition + type(domain1D), pointer :: y(:) => NULL() !< y-direction domain decomposition + type(domain2D_spec),pointer :: list(:) => NULL() !< domain decomposition on pe list + type(tile_type), pointer :: tileList(:) => NULL() !< store tile information + type(overlapSpec), pointer :: check_C => NULL() !< send and recv information for boundary consistency check of C-cell + type(overlapSpec), pointer :: check_E => NULL() !< send and recv information for boundary consistency check of E-cell + type(overlapSpec), pointer :: check_N => NULL() !< send and recv information for boundary consistency check of N-cell + type(overlapSpec), pointer :: bound_C => NULL() !< send information for getting boundary value for symmetry domain. + type(overlapSpec), pointer :: bound_E => NULL() !< send information for getting boundary value for symmetry domain. + type(overlapSpec), pointer :: bound_N => NULL() !< send information for getting boundary value for symmetry domain. + type(overlapSpec), pointer :: update_T => NULL() !< send and recv information for halo update of T-cell. + type(overlapSpec), pointer :: update_E => NULL() !< send and recv information for halo update of E-cell. + type(overlapSpec), pointer :: update_C => NULL() !< send and recv information for halo update of C-cell. + type(overlapSpec), pointer :: update_N => NULL() !< send and recv information for halo update of N-cell. 
+ type(domain2d), pointer :: io_domain => NULL() !< domain for IO, will be set through calling mpp_set_io_domain ( this will be changed). end type domain2D - !--- the following type is used to reprsent the contact between tiles. - !--- this type will only be used in mpp_domains_define.inc + !> Type used to represent the contact between tiles. + !> @note This type will only be used in mpp_domains_define.inc type contact_type private - integer :: ncontact ! number of neighbor tile. - integer, pointer :: tile(:) =>NULL() ! neighbor tile - integer, pointer :: align1(:)=>NULL(), align2(:)=>NULL() ! alignment of me and neighbor + integer :: ncontact !< number of neighbor tile. + integer, pointer :: tile(:) =>NULL() !< neighbor tile + integer, pointer :: align1(:)=>NULL(), align2(:)=>NULL() !< alignment of me and neighbor real, pointer :: refine1(:)=>NULL(), refine2(:)=>NULL() ! - integer, pointer :: is1(:)=>NULL(), ie1(:)=>NULL() ! i-index of current tile repsenting contact - integer, pointer :: js1(:)=>NULL(), je1(:)=>NULL() ! j-index of current tile repsenting contact - integer, pointer :: is2(:)=>NULL(), ie2(:)=>NULL() ! i-index of neighbor tile repsenting contact - integer, pointer :: js2(:)=>NULL(), je2(:)=>NULL() ! j-index of neighbor tile repsenting contact + integer, pointer :: is1(:)=>NULL(), ie1(:)=>NULL() !< i-index of current tile representing contact + integer, pointer :: js1(:)=>NULL(), je1(:)=>NULL() !< j-index of current tile representing contact + integer, pointer :: is2(:)=>NULL(), ie2(:)=>NULL() !< i-index of neighbor tile representing contact + integer, pointer :: js2(:)=>NULL(), je2(:)=>NULL() !< j-index of neighbor tile representing contact end type contact_type @@ -478,10 +453,10 @@ module mpp_domains_mod type DomainCommunicator2D private logical :: initialized=.false. 
- integer(LONG_KIND) :: id=-9999 - integer(LONG_KIND) :: l_addr =-9999 - integer(LONG_KIND) :: l_addrx =-9999 - integer(LONG_KIND) :: l_addry =-9999 + integer(i8_kind) :: id=-9999 + integer(i8_kind) :: l_addr =-9999 + integer(i8_kind) :: l_addrx =-9999 + integer(i8_kind) :: l_addry =-9999 type(domain2D), pointer :: domain =>NULL() type(domain2D), pointer :: domain_in =>NULL() type(domain2D), pointer :: domain_out =>NULL() @@ -512,13 +487,13 @@ module mpp_domains_mod integer, dimension(:) , allocatable :: jsizeR integer, dimension(:,:), allocatable :: sendisR integer, dimension(:,:), allocatable :: sendjsR - integer(LONG_KIND), dimension(:), allocatable :: rem_addr - integer(LONG_KIND), dimension(:), allocatable :: rem_addrx - integer(LONG_KIND), dimension(:), allocatable :: rem_addry - integer(LONG_KIND), dimension(:,:), allocatable :: rem_addrl - integer(LONG_KIND), dimension(:,:), allocatable :: rem_addrlx - integer(LONG_KIND), dimension(:,:), allocatable :: rem_addrly - integer :: position ! data location. T, E, C, or N. + integer(i8_kind), dimension(:), allocatable :: rem_addr + integer(i8_kind), dimension(:), allocatable :: rem_addrx + integer(i8_kind), dimension(:), allocatable :: rem_addry + integer(i8_kind), dimension(:,:), allocatable :: rem_addrl + integer(i8_kind), dimension(:,:), allocatable :: rem_addrlx + integer(i8_kind), dimension(:,:), allocatable :: rem_addrly + integer :: position !< data location. T, E, C, or N. 
end type DomainCommunicator2D integer, parameter :: MAX_REQUEST = 100 @@ -543,8 +518,8 @@ module mpp_domains_mod integer, dimension(MAX_REQUEST) :: type_recv integer, dimension(MAX_REQUEST) :: buffer_pos_send integer, dimension(MAX_REQUEST) :: buffer_pos_recv - integer(LONG_KIND) :: field_addrs(MAX_DOMAIN_FIELDS) - integer(LONG_KIND) :: field_addrs2(MAX_DOMAIN_FIELDS) + integer(i8_kind) :: field_addrs(MAX_DOMAIN_FIELDS) + integer(i8_kind) :: field_addrs2(MAX_DOMAIN_FIELDS) integer :: nfields end type nonblock_type @@ -593,9 +568,9 @@ module mpp_domains_mod integer :: unpack_ie(MAXOVERLAP) integer :: unpack_js(MAXOVERLAP) integer :: unpack_je(MAXOVERLAP) - integer(LONG_KIND) :: addrs_s(MAX_DOMAIN_FIELDS) - integer(LONG_KIND) :: addrs_x(MAX_DOMAIN_FIELDS) - integer(LONG_KIND) :: addrs_y(MAX_DOMAIN_FIELDS) + integer(i8_kind) :: addrs_s(MAX_DOMAIN_FIELDS) + integer(i8_kind) :: addrs_x(MAX_DOMAIN_FIELDS) + integer(i8_kind) :: addrs_y(MAX_DOMAIN_FIELDS) integer :: buffer_start_pos = -1 integer :: request_send(MAX_REQUEST) integer :: request_recv(MAX_REQUEST) @@ -634,43 +609,45 @@ module mpp_domains_mod !-------- The following variables are used in mpp_domains_comm.h integer, parameter :: MAX_ADDRS=512 - integer(LONG_KIND),dimension(MAX_ADDRS),save :: addrs_sorted=-9999 ! list of sorted local addrs - integer, dimension(-1:MAX_ADDRS),save :: addrs_idx=-9999 ! idx of addr assoicated w/ d_comm - integer, dimension(MAX_ADDRS),save :: a_salvage=-9999 ! freed idx list of addr - integer, save :: a_sort_len=0 ! len sorted memory list - integer, save :: n_addrs=0 ! 
num memory addresses used + integer(i8_kind),dimension(MAX_ADDRS),save :: addrs_sorted=-9999 !< list of sorted local addresses + integer, dimension(-1:MAX_ADDRS),save :: addrs_idx=-9999 !< index of address associated with d_comm + integer, dimension(MAX_ADDRS),save :: a_salvage=-9999 !< freed index list of addresses + integer, save :: a_sort_len=0 !< length sorted memory list + integer, save :: n_addrs=0 !< number of memory addresses used - integer(LONG_KIND), parameter :: ADDR2_BASE = int(Z'0000000000010000', kind=LONG_KIND) + integer(i8_kind), parameter :: ADDR2_BASE = int(Z'0000000000010000', kind=i8_kind) integer, parameter :: MAX_ADDRS2=128 - integer(LONG_KIND),dimension(MAX_ADDRS2),save :: addrs2_sorted=-9999 ! list of sorted local addrs - integer, dimension(-1:MAX_ADDRS2),save :: addrs2_idx=-9999 ! idx of addr2 assoicated w/ d_comm - integer, dimension(MAX_ADDRS2),save :: a2_salvage=-9999 ! freed indices of addr2 - integer, save :: a2_sort_len=0 ! len sorted memory list - integer, save :: n_addrs2=0 ! num memory addresses used + integer(i8_kind),dimension(MAX_ADDRS2),save :: addrs2_sorted=-9999 !< list of sorted local addresses + integer, dimension(-1:MAX_ADDRS2),save :: addrs2_idx=-9999 !< index of addr2 associated with d_comm + integer, dimension(MAX_ADDRS2),save :: a2_salvage=-9999 !< freed indices of addr2 + integer, save :: a2_sort_len=0 !< length sorted memory list + integer, save :: n_addrs2=0 !< number of memory addresses used integer, parameter :: MAX_DOM_IDS=128 - integer(LONG_KIND),dimension(MAX_DOM_IDS),save :: ids_sorted=-9999 ! list of sorted domain identifiers - integer, dimension(-1:MAX_DOM_IDS),save :: ids_idx=-9999 ! idx of d_comm associated w/ sorted addr - integer, save :: i_sort_len=0 ! len sorted domain ids list - integer, save :: n_ids=0 ! 
num domain ids used (=i_sort_len; dom ids never removed) + integer(i8_kind),dimension(MAX_DOM_IDS),save :: ids_sorted=-9999 !< list of sorted domain identifiers + integer, dimension(-1:MAX_DOM_IDS),save :: ids_idx=-9999 !< index of d_comm associated with sorted addresses + integer, save :: i_sort_len=0 !< length sorted domain ids list + integer, save :: n_ids=0 !< number of domain ids used + !!(=i_sort_len; domain ids are never removed) integer, parameter :: MAX_FIELDS=1024 - integer(LONG_KIND), dimension(MAX_FIELDS),save :: dcKey_sorted=-9999 ! list of sorted local addrs + integer(i8_kind), dimension(MAX_FIELDS),save :: dcKey_sorted=-9999 !< list of sorted local addresses ! Not sure why static d_comm fails during deallocation of derived type members; allocatable works - ! type(DomainCommunicator2D),dimension(MAX_FIELDS),save,target :: d_comm ! domain communicators - type(DomainCommunicator2D),dimension(:),allocatable,save,target :: d_comm ! domain communicators - integer, dimension(-1:MAX_FIELDS),save :: d_comm_idx=-9999 ! idx of d_comm associated w/ sorted addr - integer, dimension(MAX_FIELDS),save :: dc_salvage=-9999 ! freed indices of d_comm - integer, save :: dc_sort_len=0 ! len sorted comm keys (=num active communicators) - integer, save :: n_comm=0 ! num communicators used + ! type(DomainCommunicator2D),dimension(MAX_FIELDS),save,target :: d_comm !< domain communicators + type(DomainCommunicator2D),dimension(:),allocatable,save,target :: d_comm !< domain communicators + integer, dimension(-1:MAX_FIELDS),save :: d_comm_idx=-9999 !< index of d_comm associated with sorted addresses + integer, dimension(MAX_FIELDS),save :: dc_salvage=-9999 !< freed indices of d_comm + integer, save :: dc_sort_len=0 !< length sorted comm keys +!! (=num active communicators) + integer, save :: n_comm=0 !< number of communicators used - ! integer(LONG_KIND), parameter :: GT_BASE=2**8 - integer(LONG_KIND), parameter :: GT_BASE = int(Z'0000000000000100', kind=LONG_KIND) + ! 
integer(i8_kind), parameter :: GT_BASE=2**8 + integer(i8_kind), parameter :: GT_BASE = int(Z'0000000000000100', kind=i8_kind) - ! integer(LONG_KIND), parameter :: KE_BASE=2**48 - integer(LONG_KIND), parameter :: KE_BASE = int(Z'0001000000000000', kind=LONG_KIND) + ! integer(i8_kind), parameter :: KE_BASE=2**48 + integer(i8_kind), parameter :: KE_BASE = int(Z'0001000000000000', kind=i8_kind) - integer(LONG_KIND) :: domain_cnt=0 + integer(i8_kind) :: domain_cnt=0 !--- the following variables are used in mpp_domains_misc.h logical :: domain_clocks_on=.FALSE. @@ -684,29 +661,25 @@ module mpp_domains_mod integer :: nonblock_group_recv_clock=0, nonblock_group_send_clock=0, nonblock_group_pack_clock=0 integer :: nonblock_group_unpk_clock=0, nonblock_group_wait_clock=0 - !--- namelist interface -! -! -! when debug_update_domain = none, no debug will be done. When debug_update_domain is set to fatal, -! the run will be exited with fatal error message. When debug_update_domain is set to -! warning, the run will output warning message. when debug update_domain is set to -! note, the run will output some note message. Will check the consistency on the boundary between -! processor/tile when updating doamin for symmetric domain and check the consistency on the north -! folded edge. -! -! -! Set true to always do overflow_check when doing EFP bitwise mpp_global_sum. -! -! -! Determine the loop order for packing and unpacking. When number of threads is greater than nthread_control_loop, -! k-loop will be moved outside and combined with number of pack and unpack. When number of threads is less -! than or equal to nthread_control_loop, k-loop is moved inside but still outside of j,i loop. -! -! - character(len=32) :: debug_update_domain = "none" - logical :: debug_message_passing = .false. - integer :: nthread_control_loop = 8 - logical :: efp_sum_overflow_check = .false. 
+!> namelist interface + character(len=32) :: debug_update_domain = "none" !< when debug_update_domain = none, no debug will be done. + !! When debug_update_domain is set to fatal, + !! the run will be exited with fatal error message + !! When debug_update_domain is set to warning, + !! the run will output warning message. + !! When debug update_domain is set to note, + !! the run will output some note message. + logical :: debug_message_passing = .false. !< Will check the consistency on the boundary between + !! processor/tile when updating domain for symmetric domain and + !! check the consistency on the north folded edge. + integer :: nthread_control_loop = 8 !< Determine the loop order for packing and unpacking. + !! When number of threads is greater than nthread_control_loop, + !! the k-loop will be moved outside and combined with number + !! of pack and unpack. When the number of threads is + !! less than or equal to nthread_control_loop, the k-loop + !! is moved inside, but still outside, of j,i loop. + logical :: efp_sum_overflow_check = .false. !< If .true., always do overflow_check + !! when doing EFP bitwise mpp_global_sum. logical :: use_alltoallw = .false. namelist /mpp_domains_nml/ debug_update_domain, domain_clocks_on, debug_message_passing, nthread_control_loop, & efp_sum_overflow_check, use_alltoallw @@ -720,54 +693,27 @@ module mpp_domains_mod ! public interface from mpp_domains_define.h ! !*********************************************************************** - - ! - ! - ! Retrieve layout associated with a domain decomposition. - ! - ! - ! Given a global 2D domain and the number of divisions in the - ! decomposition (ndivs: usually the PE count unless some - ! domains are masked) this calls returns a 2D domain layout. - ! - ! By default, mpp_define_layout will attempt to divide the - ! 2D index space into domains that maintain the aspect ratio of the - ! global domain. If this cannot be done, the algorithm favours domains - ! 
that are longer in x than y, a preference that could - ! improve vector performance. - ! - ! - ! - ! - ! - ! - + !> @brief Retrieve layout associated with a domain decomposition. + !> @detailed Given a global 2D domain and the number of divisions in the + !! decomposition ndivs (usually the PE count unless some + !! domains are \e masked) this calls returns a 2D domain layout. + !! By default, mpp_define_layout will attempt to divide the + !! 2D index space into domains that maintain the aspect ratio of the + !! global domain. If this cannot be done, the algorithm favours domains + !! that are longer in \e x than \e y, a preference that could improve vector performance. + !> @example call mpp_define_layout( global_indices, ndivs, layout ) interface mpp_define_layout module procedure mpp_define_layout2D end interface - - ! - - ! - ! Set up a domain decomposition. - ! - ! - ! There are two forms for the mpp_define_domains call. The 2D - ! version is generally to be used but is built by repeated calls to the - ! 1D version, also provided. - ! - ! - ! + !> @brief Set up a domain decomposition. + !> @detailed There are two forms for the \e mpp_define_domains call. The 2D version is generally + !! to be used but is built by repeated calls to the 1D version, also provided. + !> @example call mpp_define_domains( global_indices, ndivs, domain, & + !! pelist, flags, halo, extent, maskmap ) + !> @example call mpp_define_domains( global_indices, layout, domain, pelist, & + !! xflags, yflags, xhalo, yhalo, & + !! xextent, yextent, maskmap, name ) ! ! Defines the global domain. ! @@ -814,61 +760,46 @@ module mpp_domains_mod ! ! ! - - ! - ! For example: - ! - !
-  !    call mpp_define_domains( (/1,100/), 10, domain, &
-  !         flags=GLOBAL_DATA_DOMAIN+CYCLIC_GLOBAL_DOMAIN, halo=2 )
-  !    
- ! - ! defines 10 compute domains spanning the range [1,100] of the global - ! domain. The compute domains are non-overlapping blocks of 10. All the data - ! domains are global, and with a halo of 2 span the range [-1:102]. And - ! since the global domain has been declared to be cyclic, - ! domain(9)%next => domain(0) and domain(0)%prev => - ! domain(9). A field is allocated on the data domain, and computations proceed on - ! the compute domain. A call to mpp_update_domains would fill in - ! the values in the halo region: - - !
-  !    call mpp_get_data_domain( domain, isd, ied ) !returns -1 and 102
-  !    call mpp_get_compute_domain( domain, is, ie ) !returns (1,10) on PE 0 ...
-  !    allocate( a(isd:ied) )
-  !    do i = is,ie
-  !       a(i) = <perform computations>
-  !    end do
-  !    call mpp_update_domains( a, domain )
-  !    
- - ! The call to mpp_update_domains fills in the regions outside - ! the compute domain. Since the global domain is cyclic, the values at - ! i=(-1,0) are the same as at i=(99,100); and - ! i=(101,102) are the same as i=(1,2). + !> @example call mpp_define_domains( (/1,100/), 10, domain, & + !! flags=GLOBAL_DATA_DOMAIN+CYCLIC_GLOBAL_DOMAIN, halo=2 ) + !! + !! defines 10 compute domains spanning the range [1,100] of the global + !! domain. The compute domains are non-overlapping blocks of 10. All the data + !! domains are global, and with a halo of 2 span the range [-1:102]. And + !! since the global domain has been declared to be cyclic, + !! domain(9)%next => domain(0) and domain(0)%prev => + !! domain(9). A field is allocated on the data domain, and computations proceed on + !! the compute domain. A call to mpp_update_domains would fill in the values + !! in the halo region:\n + !!\n + !! call mpp_get_data_domain( domain, isd, ied ) !returns -1 and 102 + !! call mpp_get_compute_domain( domain, is, ie ) !returns (1,10) on PE 0 ... + !! allocate( a(isd:ied) ) + !! do i = is,ie + !! a(i) = <perform computations> + !! end do + !! call mpp_update_domains( a, domain )\n + !!\n + !! The call to mpp_update_domainsfills in the regions outside + !! the compute domain. Since the global domain is cyclic, the values at + !! \e i=(-1,0) are the same as at \e i=(99,100); and \e i=(101,102) + !! are the same as \e i=(1,2). ! - ! The 2D version is just an extension of this syntax to two - ! dimensions. - ! - ! The 2D version of the above should generally be used in - ! codes, including 1D-decomposed ones, if there is a possibility of - ! future evolution toward 2D decomposition. The arguments are similar to - ! the 1D case, except that now we have optional arguments - ! flags, halo, extent and maskmap - ! along two axes. - ! - ! flags can now take an additional possible value to fold - ! one or more edges. This is done by using flags - ! FOLD_WEST_EDGE, FOLD_EAST_EDGE, - ! 
FOLD_SOUTH_EDGE or FOLD_NORTH_EDGE. When a fold - ! exists (e.g cylindrical domain), vector fields reverse sign upon - ! crossing the fold. This parity reversal is performed only in the - ! vector version of mpp_update_domains. In - ! addition, shift operations may need to be applied to vector fields on - ! staggered grids, also described in the vector interface to - ! mpp_update_domains. + !> @example The 2D version is just an extension of this syntax to two dimensions. + !! + !! The 2D version of the above should generally be used in + !! codes, including 1D-decomposed ones, if there is a possibility of + !! future evolution toward 2D decomposition. The arguments are similar to + !! the 1D case, except that now we have optional arguments + !! flags, halo, extent and maskmap along two axes. + !! + !! flags can now take an additional possible value to fold one or more edges. + !! This is done by using flags \e FOLD_WEST_EDGE, \e FOLD_EAST_EDGE, \e FOLD_SOUTH_EDGE or + !! \e FOLD_NORTH_EDGE. When a fold exists (e.g cylindrical domain), + !! vector fields reverse sign upon + !! crossing the fold. This parity reversal is performed only in the vector version of + !! mpp_update_domains. In addition, shift operations may need to be applied to vector fields on + !! staggered grids, also described in the vector interface to mpp_update_domains. ! ! name is the name associated with the decomposition, ! e.g 'Ocean model'. If this argument is present, @@ -877,44 +808,33 @@ module mpp_domains_mod ! ! Examples: ! - !
-  !    call mpp_define_domains( (/1,100,1,100/), (/2,2/), domain, xhalo=1 )
-  !    
- ! - ! will create the following domain layout: - !
-  !                   |---------|-----------|-----------|-------------|
-  !                   |domain(1)|domain(2)  |domain(3)  |domain(4)    |
-  !    |--------------|---------|-----------|-----------|-------------|
-  !    |Compute domain|1,50,1,50|51,100,1,50|1,50,51,100|51,100,51,100|
-  !    |--------------|---------|-----------|-----------|-------------|
-  !    |Data domain   |0,51,1,50|50,101,1,50|0,51,51,100|50,101,51,100|
-  !    |--------------|---------|-----------|-----------|-------------|
-  !    
- ! - ! Again, we allocate arrays on the data domain, perform computations - ! on the compute domain, and call mpp_update_domains to update - ! the halo region. + !> @example call mpp_define_domains( (/1,100,1,100/), (/2,2/), domain, xhalo=1 ) + !! will create the following domain layout:\n + !!\n + !! |---------|-----------|-----------|-------------| + !! |domain(1)|domain(2) |domain(3) |domain(4) | + !! |--------------|---------|-----------|-----------|-------------| + !! |Compute domain|1,50,1,50|51,100,1,50|1,50,51,100|51,100,51,100| + !! |--------------|---------|-----------|-----------|-------------| + !! |Data domain |0,51,1,50|50,101,1,50|0,51,51,100|50,101,51,100| + !! |--------------|---------|-----------|-----------|-------------| + !! + !! Again, we allocate arrays on the data domain, perform computations + !! on the compute domain, and call mpp_update_domains to update the halo region. ! - ! If we wished to perfom a 1D decomposition along Y - ! on the same global domain, we could use: - - !
-  !    call mpp_define_domains( (/1,100,1,100/), layout=(/4,1/), domain, xhalo=1 )
-  !    
- - ! This will create the following domain layout: - !
-  !                   |----------|-----------|-----------|------------|
-  !                   |domain(1) |domain(2)  |domain(3)  |domain(4)   |
-  !    |--------------|----------|-----------|-----------|------------|
-  !    |Compute domain|1,100,1,25|1,100,26,50|1,100,51,75|1,100,76,100|
-  !    |--------------|----------|-----------|-----------|------------|
-  !    |Data domain   |0,101,1,25|0,101,26,50|0,101,51,75|1,101,76,100|
-  !    |--------------|----------|-----------|-----------|------------|
-  !    
- !
- !
+ !> @example If we wished to perform a 1D decomposition along Y on the same global domain, + !! we could use: + !! call mpp_define_domains( (/1,100,1,100/), layout=(/4,1/), domain, xhalo=1 ) + !! This will create the following domain layout:\n + !!\n + !! |----------|-----------|-----------|------------| + !! |domain(1) |domain(2) |domain(3) |domain(4) | + !! |--------------|----------|-----------|-----------|------------| + !! |Compute domain|1,100,1,25|1,100,26,50|1,100,51,75|1,100,76,100| + !! |--------------|----------|-----------|-----------|------------| + !! |Data domain |0,101,1,25|0,101,26,50|0,101,51,75|1,101,76,100| + !! |--------------|----------|-----------|-----------|------------| + interface mpp_define_domains module procedure mpp_define_domains1D module procedure mpp_define_domains2D @@ -935,43 +855,7 @@ module mpp_domains_mod module procedure mpp_deallocate_domain2D end interface -! -! -! modifies the extents (compute, data and global) of domain -! -! -! The source domain. -! -! -! Halo size of the returned 1D doamin. Default value is 0. -! -! -! Axis specifications associated with the compute domain of the returned 1D domain. -! -! -! Axis specifications associated with the global domain of the returned 1D domain. -! -! -! Zonal axis specifications associated with the compute domain of the returned 2D domain. -! -! -! Meridinal axis specifications associated with the compute domain of the returned 2D domain. -! -! -! Zonal axis specifications associated with the global domain of the returned 2D domain. -! -! -! Meridinal axis specifications associated with the global domain of the returned 2D domain. -! -! -! Halo size of the returned 2D doamin. Default value is 0. -! -! -! The returned domain. -! - -! - +!> @brief modifies the extents (compute, data and global) of domain interface mpp_modify_domain module procedure mpp_modify_domain1D module procedure mpp_modify_domain2D @@ -984,92 +868,75 @@ module mpp_domains_mod ! 
!*********************************************************************** -! -! -! Halo updates. -! -! -! mpp_update_domains is used to perform a halo update of a -! domain-decomposed array on each PE. MPP_TYPE_ can be of type -! complex, integer, logical or real; -! of 4-byte or 8-byte kind; of rank up to 5. The vector version (with -! two input data fields) is only present for real types. -! -! For 2D domain updates, if there are halos present along both -! x and y, we can choose to update one only, by -! specifying flags=XUPDATE or flags=YUPDATE. In -! addition, one-sided updates can be performed by setting flags -! to any combination of WUPDATE, EUPDATE, -! SUPDATE and NUPDATE, to update the west, east, north -! and south halos respectively. Any combination of halos may be used by -! adding the requisite flags, e.g: flags=XUPDATE+SUPDATE or -! flags=EUPDATE+WUPDATE+SUPDATE will update the east, west and -! south halos. -! -! If a call to mpp_update_domains involves at least one E-W -! halo and one N-S halo, the corners involved will also be updated, i.e, -! in the example above, the SE and SW corners will be updated. -! -! If flags is not supplied, that is -! equivalent to flags=XUPDATE+YUPDATE. -! -! The vector version is passed the x and y -! components of a vector field in tandem, and both are updated upon -! return. They are passed together to treat parity issues on various -! grids. For example, on a cubic sphere projection, the x and -! y components may be interchanged when passing from an -! equatorial cube face to a polar face. For grids with folds, vector -! components change sign on crossing the fold. Paired scalar quantities -! can also be passed with the vector version if flags=SCALAR_PAIR, in which -! case components are appropriately interchanged, but signs are not. -! -! Special treatment at boundaries such as folds is also required for -! staggered grids. The following types of staggered grids are -! recognized: +!> @brief Halo updates. 
+!> @details mpp_update_domains is used to perform a halo update of a +!! domain-decomposed array on each PE. \e MPP_TYPE can be of type +!! complex, integer, logical or real of 4-byte or 8-byte kind; of rank up to 5. +!! The vector version (with two input data fields) is only present for real types. +!! For 2D domain updates, if there are halos present along both +!! x and y, we can choose to update one only, by specifying \e flags=XUPDATE or \e flags=YUPDATE. +!! In addition, one-sided updates can be performed by setting flags +!! to any combination of WUPDATE, EUPDATE, SUPDATE and NUPDATE +!! to update the west, east, north and south halos respectively. +!! Any combination of halos may be used by adding the requisite flags, e.g: +!! \e flags=XUPDATE+SUPDATE or \e flags=EUPDATE+WUPDATE+SUPDATE will update the east, +!! west and south halos.\n +!!\n +!! If a call to \e mpp_update_domains involves at least one E-W +!! halo and one N-S halo, the corners involved will also be updated, i.e, +!! in the example above, the SE and SW corners will be updated.\n +!! If \e flags is not supplied, that is equivalent to \e flags=XUPDATE+YUPDATE.\n +!!\n +!! The vector version is passed the \e x and \e y components of a vector field in tandem, +!! and both are updated upon return. They are passed together to treat parity issues on various +!! grids. For example, on a cubic sphere projection, the \e x and \e y components may be +!! interchanged when passing from an equatorial cube face to a polar face. +!! For grids with folds, vector components change sign on crossing the fold. Paired scalar +!! quantities can also be passed with the vector version if \e flags=SCALAR_PAIR, in which +!! case components are appropriately interchanged, but signs are not.\n +!!\n +!! Special treatment at boundaries such as folds is also required for +!! staggered grids. The following types of staggered grids are +!! recognized:\n +!!\n +!! 1) AGRID: values are at grid centers.\n +!! 
2) BGRID_NE: vector fields are at the NE vertex of a grid +!! cell, i.e: the array elements \e u(i,j) and \e v(i,j) are +!! actually at (i+½,j+½) with respect to the grid centers.\n +!! 3) BGRID_SW: vector fields are at the SW vertex of a grid +!! cell, i.e: the array elements \e u(i,j) and \e v(i,j) are +!! actually at (i-½,j-½) with respect to the grid centers.\n +!! 4) CGRID_NE: vector fields are at the N and E faces of a +!! grid cell, i.e: the array elements \e u(i,j) and \e v(i,j) +!! are actually at (i+½,j) and (i,j+½) with respect to the +!! grid centers.\n +!! 5) CGRID_SW: vector fields are at the S and W faces of a +!! grid cell, i.e: the array elements \e u(i,j) and \e v(i,j) +!! are actually at (i-½,j) and (i,j-½) with respect to the +!! grid centers.\n +!!\n +!! The gridtypes listed above are all available by use association as +!! integer parameters. The scalar version of \e mpp_update_domains +!! assumes that the values of a scalar field are always at \e AGRID +!! locations, and no special boundary treatment is required. If vector +!! fields are at staggered locations, the optional argument +!! \e gridtype must be appropriately set for correct treatment at +!! boundaries. +!!\n +!! It is safe to apply vector field updates to the appropriate arrays +!! irrespective of the domain topology: if the topology requires no +!! special treatment of vector fields, specifying \e gridtype will +!! do no harm.\n +!!\n +!! \e mpp_update_domains internally buffers the data being sent +!! and received into single messages for efficiency. A tunable internal +!! buffer area in memory is provided for this purpose by +!! \e mpp_domains_mod. The size of this buffer area can be set by +!! the user by calling +!! \e mpp_domains_set_stack_size. ! -! 1) AGRID: values are at grid centers.
-! 2) BGRID_NE: vector fields are at the NE vertex of a grid -! cell, i.e: the array elements u(i,j) and v(i,j) are -! actually at (i+½,j+½) with respect to the grid centers.
-! 3) BGRID_SW: vector fields are at the SW vertex of a grid -! cell, i.e: the array elements u(i,j) and v(i,j) are -! actually at (i-½,j-½) with respect to the grid centers.
-! 4) CGRID_NE: vector fields are at the N and E faces of a -! grid cell, i.e: the array elements u(i,j) and v(i,j) -! are actually at (i+½,j) and (i,j+½) with respect to the -! grid centers.
-! 5) CGRID_SW: vector fields are at the S and W faces of a -! grid cell, i.e: the array elements u(i,j) and v(i,j) -! are actually at (i-½,j) and (i,j-½) with respect to the -! grid centers. -! -! The gridtypes listed above are all available by use association as -! integer parameters. The scalar version of mpp_update_domains -! assumes that the values of a scalar field are always at AGRID -! locations, and no special boundary treatment is required. If vector -! fields are at staggered locations, the optional argument -! gridtype must be appropriately set for correct treatment at -! boundaries. -! -! It is safe to apply vector field updates to the appropriate arrays -! irrespective of the domain topology: if the topology requires no -! special treatment of vector fields, specifying gridtype will -! do no harm. -! -! mpp_update_domains internally buffers the date being sent -! and received into single messages for efficiency. A turnable internal -! buffer area in memory is provided for this purpose by -! mpp_domains_mod. The size of this buffer area can be set by -! the user by calling -! mpp_domains_set_stack_size. -!
-! -! -!
+!> @example call mpp_update_domains( field, domain, flags ) +!> @example call mpp_update_domains( fieldx, fieldy, domain, flags, gridtype ) interface mpp_update_domains module procedure mpp_update_domain2D_r8_2d module procedure mpp_update_domain2D_r8_3d @@ -1085,13 +952,10 @@ module mpp_domains_mod module procedure mpp_update_domain2D_c8_4d module procedure mpp_update_domain2D_c8_5d #endif -#ifndef no_8byte_integers module procedure mpp_update_domain2D_i8_2d module procedure mpp_update_domain2D_i8_3d module procedure mpp_update_domain2D_i8_4d module procedure mpp_update_domain2D_i8_5d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_update_domain2D_r4_2d module procedure mpp_update_domain2D_r4_3d module procedure mpp_update_domain2D_r4_4d @@ -1100,7 +964,6 @@ module mpp_domains_mod module procedure mpp_update_domain2D_r4_3dv module procedure mpp_update_domain2D_r4_4dv module procedure mpp_update_domain2D_r4_5dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_update_domain2D_c4_2d module procedure mpp_update_domain2D_c4_3d @@ -1113,155 +976,147 @@ module mpp_domains_mod module procedure mpp_update_domain2D_i4_5d end interface -! -! -! Interface to start halo updates. -! -! -! mpp_start_update_domains is used to start a halo update of a -! domain-decomposed array on each PE. MPP_TYPE_ can be of type -! complex, integer, logical or real; -! of 4-byte or 8-byte kind; of rank up to 5. The vector version (with -! two input data fields) is only present for real types. -! -! mpp_start_update_domains must be paired together with -! mpp_complete_update_domains. In mpp_start_update_domains, -! a buffer will be pre-post to receive (non-blocking) the -! data and data on computational domain will be packed and sent (non-blocking send) -! to other processor. In mpp_complete_update_domains, buffer will -! be unpacked to fill the halo and mpp_sync_self will be called to -! to ensure communication safe at the last call of mpp_complete_update_domains. -! -! 
Each mpp_update_domains can be replaced by the combination of mpp_start_update_domains -! and mpp_complete_update_domains. The arguments in mpp_start_update_domains -! and mpp_complete_update_domains should be the exact the same as in -! mpp_update_domains to be replaced except no optional argument "complete". -! The following are examples on how to replace mpp_update_domains with -! mpp_start_update_domains/mpp_complete_update_domains -! -! Example 1: Replace one scalar mpp_update_domains. -! -! Replace -! -! call mpp_update_domains(data, domain, flags=update_flags) -! -! with -! -! id_update = mpp_start_update_domains(data, domain, flags=update_flags)
-! ...( doing some computation )
-! call mpp_complete_update_domains(id_update, data, domain, flags=update_flags)
-!
-! Example 2: Replace group scalar mpp_update_domains, -! -! Replace -! -! call mpp_update_domains(data_1, domain, flags=update_flags, complete=.false.)
-! .... ( other n-2 call mpp_update_domains with complete = .false. )
-! call mpp_update_domains(data_n, domain, flags=update_flags, complete=.true. )
-!
-! With -! -! id_up_1 = mpp_start_update_domains(data_1, domain, flags=update_flags)
-! .... ( other n-2 call mpp_start_update_domains )
-! id_up_n = mpp_start_update_domains(data_n, domain, flags=update_flags)
-! -! ..... ( doing some computation ) -! -! call mpp_complete_update_domains(id_up_1, data_1, domain, flags=update_flags)
-! .... ( other n-2 call mpp_complete_update_domains )
-! call mpp_complete_update_domains(id_up_n, data_n, domain, flags=update_flags)
-!
-! Example 3: Replace group CGRID_NE vector, mpp_update_domains -! -! Replace +!> @brief Interface to start halo updates +!> @details \e mpp_start_update_domains is used to start a halo update of a +!! domain-decomposed array on each PE. \e MPP_TYPE_ can be of type +!! \e complex, \e integer, \e logical or \e real; +!! of 4-byte or 8-byte kind; of rank up to 5. The vector version (with +!! two input data fields) is only present for \e real types.\n +!!\n +!! \e mpp_start_update_domains must be paired together with +!! \e mpp_complete_update_domains. In \e mpp_start_update_domains, +!! a buffer will be pre-posted to receive (non-blocking) the +!! data and data on computational domain will be packed and sent (non-blocking send) +!! to other processor. In \e mpp_complete_update_domains, buffer will +!! be unpacked to fill the halo and mpp_sync_self will be called +!! to ensure communication safe at the last call of mpp_complete_update_domains.\n +!!\n +!! Each mpp_update_domains can be replaced by the combination of mpp_start_update_domains +!! and mpp_complete_update_domains. The arguments in mpp_start_update_domains +!! and mpp_complete_update_domains should be exactly the same as in +!! mpp_update_domains to be replaced except no optional argument "complete". +!! The following are examples on how to replace mpp_update_domains with +!! mpp_start_update_domains/mpp_complete_update_domains ! -! call mpp_update_domains(u_1, v_1, domain, flags=update_flgs, gridtype=CGRID_NE, complete=.false.)
-! .... ( other n-2 call mpp_update_domains with complete = .false. )
-! call mpp_update_domains(u_1, v_1, domain, flags=update_flags, gridtype=CGRID_NE, complete=.true. )
-!
-! with +!>@example Example 1: Replace one scalar mpp_update_domains.\n +!!\n +!! Replace\n +!!\n +!! call mpp_update_domains(data, domain, flags=update_flags)\n +!! +!! with\n +!!\n +!! id_update = mpp_start_update_domains(data, domain, flags=update_flags)\n +!! ...( doing some computation )\n +!! call mpp_complete_update_domains(id_update, data, domain, flags=update_flags)\n ! -! id_up_1 = mpp_start_update_domains(u_1, v_1, domain, flags=update_flags, gridtype=CGRID_NE)
-! .... ( other n-2 call mpp_start_update_domains )
-! id_up_n = mpp_start_update_domains(u_n, v_n, domain, flags=update_flags, gridtype=CGRID_NE)
-!
-! ..... ( doing some computation ) +!> @example Example 2: Replace group scalar mpp_update_domains\n +!!\n +!! Replace\n +!!\n +!! call mpp_update_domains(data_1, domain, flags=update_flags, complete=.false.)\n +!! .... ( other n-2 call mpp_update_domains with complete = .false. )\n +!! call mpp_update_domains(data_n, domain, flags=update_flags, complete=.true. )\n +!!\n +!! With\n +!!\n +!! id_up_1 = mpp_start_update_domains(data_1, domain, flags=update_flags)\n +!! .... ( other n-2 call mpp_start_update_domains )\n +!! id_up_n = mpp_start_update_domains(data_n, domain, flags=update_flags)\n +!!\n +!! ..... ( doing some computation )\n +!!\n +!! call mpp_complete_update_domains(id_up_1, data_1, domain, flags=update_flags)\n +!! .... ( other n-2 call mpp_complete_update_domains )\n +!! call mpp_complete_update_domains(id_up_n, data_n, domain, flags=update_flags)\n ! -! call mpp_complete_update_domains(id_up_1, u_1, v_1, domain, flags=update_flags, gridtype=CGRID_NE)
-! .... ( other n-2 call mpp_complete_update_domains )
-! call mpp_complete_update_domains(id_up_n, u_n, v_n, domain, flags=update_flags, gridtype=CGRID_NE)
-!
-! For 2D domain updates, if there are halos present along both -! x and y, we can choose to update one only, by -! specifying flags=XUPDATE or flags=YUPDATE. In -! addition, one-sided updates can be performed by setting flags -! to any combination of WUPDATE, EUPDATE, -! SUPDATE and NUPDATE, to update the west, east, north -! and south halos respectively. Any combination of halos may be used by -! adding the requisite flags, e.g: flags=XUPDATE+SUPDATE or -! flags=EUPDATE+WUPDATE+SUPDATE will update the east, west and -! south halos. +!> @example Example 3: Replace group CGRID_NE vector, mpp_update_domains\n +!!\n +!! Replace\n +!!\n +!! call mpp_update_domains(u_1, v_1, domain, flags=update_flgs, gridtype=CGRID_NE, complete=.false.)\n +!! .... ( other n-2 call mpp_update_domains with complete = .false. )\n +!! call mpp_update_domains(u_1, v_1, domain, flags=update_flags, gridtype=CGRID_NE, complete=.true. )\n +!!\n +!! with\n +!!\n +!! id_up_1 = mpp_start_update_domains(u_1, v_1, domain, flags=update_flags, gridtype=CGRID_NE)\n +!! .... ( other n-2 call mpp_start_update_domains )\n +!! id_up_n = mpp_start_update_domains(u_n, v_n, domain, flags=update_flags, gridtype=CGRID_NE)\n +!!\n +!! ..... ( doing some computation )\n +!!\n +!! call mpp_complete_update_domains(id_up_1, u_1, v_1, domain, flags=update_flags, gridtype=CGRID_NE)\n +!! .... ( other n-2 call mpp_complete_update_domains )\n +!! call mpp_complete_update_domains(id_up_n, u_n, v_n, domain, flags=update_flags, gridtype=CGRID_NE)\n +!!\n +!! For 2D domain updates, if there are halos present along both +!! \ex and \ey, we can choose to update one only, by +!! specifying \eflags=XUPDATE or \eflags=YUPDATE. In +!! addition, one-sided updates can be performed by setting \eflags +!! to any combination of \eWUPDATE, \eEUPDATE, +!! \eSUPDATE and \eNUPDATE, to update the west, east, north +!! and south halos respectively. Any combination of halos may be used by +!! 
adding the requisite flags, e.g: \eflags=XUPDATE+SUPDATE or +!! \eflags=EUPDATE+WUPDATE+SUPDATE will update the east, west and +!! south halos.\n +!!\n +!! If a call to \empp_start_update_domains/mpp_complete_update_domains involves at least one E-W +!! halo and one N-S halo, the corners involved will also be updated, i.e, +!! in the example above, the SE and SW corners will be updated.\n +!!\n +!! If \eflags is not supplied, that is +!! equivalent to \eflags=XUPDATE+YUPDATE.\n +!!\n +!! The vector version is passed the \ex and \ey +!! components of a vector field in tandem, and both are updated upon +!! return. They are passed together to treat parity issues on various +!! grids. For example, on a cubic sphere projection, the \ex and +!! \ey components may be interchanged when passing from an +!! equatorial cube face to a polar face. For grids with folds, vector +!! components change sign on crossing the fold. Paired scalar quantities +!! can also be passed with the vector version if flags=SCALAR_PAIR, in which +!! case components are appropriately interchanged, but signs are not.\n +!!\n +!! Special treatment at boundaries such as folds is also required for +!! staggered grids. The following types of staggered grids are +!! recognized: +!!\n +!! 1) \eAGRID: values are at grid centers.\n +!! 2) \eBGRID_NE: vector fields are at the NE vertex of a grid +!! cell, i.e: the array elements \eu(i,j) and \ev(i,j) are +!! actually at (i+½,j+½) with respect to the grid centers.\n +!! 3) \eBGRID_SW: vector fields are at the SW vertex of a grid +!! cell, i.e., the array elements \eu(i,j) and \ev(i,j) are +!! actually at (i-½,j-½) with respect to the grid centers.\n +!! 4) \eCGRID_NE: vector fields are at the N and E faces of a +!! grid cell, i.e: the array elements \eu(i,j) and \ev(i,j) +!! are actually at (i+½,j) and (i,j+½) with respect to the +!! grid centers.\n +!! 5) \eCGRID_SW: vector fields are at the S and W faces of a +!! 
grid cell, i.e: the array elements \e u(i,j) and \e v(i,j) +!! are actually at (i-½,j) and (i,j-½) with respect to the +!! grid centers.\n +!!\n +!! The gridtypes listed above are all available by use association as +!! integer parameters. If vector fields are at staggered locations, the +!! optional argument \e gridtype must be appropriately set for +!! correct treatment at boundaries. +!!\n +!! It is safe to apply vector field updates to the appropriate arrays +!! irrespective of the domain topology: if the topology requires no +!! special treatment of vector fields, specifying \e gridtype will +!! do no harm.\n +!!\n +!! \e mpp_start_update_domains/mpp_complete_update_domains internally +!! buffers the data being sent and received into single messages for efficiency. +!! A tunable internal buffer area in memory is provided for this purpose by +!! \e mpp_domains_mod. The size of this buffer area can be set by +!! the user by calling \e mpp_domains_set_stack_size. ! -! If a call to mpp_start_update_domains/mpp_complete_update_domains involves at least one E-W -! halo and one N-S halo, the corners involved will also be updated, i.e, -! in the example above, the SE and SW corners will be updated. -! -! If flags is not supplied, that is -! equivalent to flags=XUPDATE+YUPDATE. -! -! The vector version is passed the x and y -! components of a vector field in tandem, and both are updated upon -! return. They are passed together to treat parity issues on various -! grids. For example, on a cubic sphere projection, the x and -! y components may be interchanged when passing from an -! equatorial cube face to a polar face. For grids with folds, vector -! components change sign on crossing the fold. Paired scalar quantities -! can also be passed with the vector version if flags=SCALAR_PAIR, in which -! case components are appropriately interchanged, but signs are not. -! -! Special treatment at boundaries such as folds is also required for -! staggered grids. 
The following types of staggered grids are -! recognized: -! -! 1) AGRID: values are at grid centers.
-! 2) BGRID_NE: vector fields are at the NE vertex of a grid -! cell, i.e: the array elements u(i,j) and v(i,j) are -! actually at (i+½,j+½) with respect to the grid centers.
-! 3) BGRID_SW: vector fields are at the SW vertex of a grid -! cell, i.e: the array elements u(i,j) and v(i,j) are -! actually at (i-½,j-½) with respect to the grid centers.
-! 4) CGRID_NE: vector fields are at the N and E faces of a -! grid cell, i.e: the array elements u(i,j) and v(i,j) -! are actually at (i+½,j) and (i,j+½) with respect to the -! grid centers.
-! 5) CGRID_SW: vector fields are at the S and W faces of a -! grid cell, i.e: the array elements u(i,j) and v(i,j) -! are actually at (i-½,j) and (i,j-½) with respect to the -! grid centers. -! -! The gridtypes listed above are all available by use association as -! integer parameters. If vector fields are at staggered locations, the -! optional argument gridtype must be appropriately set for -! correct treatment at boundaries. -! -! It is safe to apply vector field updates to the appropriate arrays -! irrespective of the domain topology: if the topology requires no -! special treatment of vector fields, specifying gridtype will -! do no harm. -! -! mpp_start_update_domains/mpp_complete_update_domains internally -! buffers the data being sent and received into single messages for efficiency. -! A turnable internal buffer area in memory is provided for this purpose by -! mpp_domains_mod. The size of this buffer area can be set by -! the user by calling -! mpp_domains_set_stack_size. -!
-! +!> @example id_update = mpp_start_update_domains( field, domain, flags ) +!> @example call mpp_complete_update_domains( id_update, field, domain, flags ) -!
interface mpp_start_update_domains module procedure mpp_start_update_domain2D_r8_2d @@ -1278,13 +1133,10 @@ module mpp_domains_mod module procedure mpp_start_update_domain2D_c8_4d module procedure mpp_start_update_domain2D_c8_5d #endif -#ifndef no_8byte_integers module procedure mpp_start_update_domain2D_i8_2d module procedure mpp_start_update_domain2D_i8_3d module procedure mpp_start_update_domain2D_i8_4d module procedure mpp_start_update_domain2D_i8_5d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_start_update_domain2D_r4_2d module procedure mpp_start_update_domain2D_r4_3d module procedure mpp_start_update_domain2D_r4_4d @@ -1293,7 +1145,6 @@ module mpp_domains_mod module procedure mpp_start_update_domain2D_r4_3dv module procedure mpp_start_update_domain2D_r4_4dv module procedure mpp_start_update_domain2D_r4_5dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_start_update_domain2D_c4_2d module procedure mpp_start_update_domain2D_c4_3d @@ -1321,13 +1172,10 @@ module mpp_domains_mod module procedure mpp_complete_update_domain2D_c8_4d module procedure mpp_complete_update_domain2D_c8_5d #endif -#ifndef no_8byte_integers module procedure mpp_complete_update_domain2D_i8_2d module procedure mpp_complete_update_domain2D_i8_3d module procedure mpp_complete_update_domain2D_i8_4d module procedure mpp_complete_update_domain2D_i8_5d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_complete_update_domain2D_r4_2d module procedure mpp_complete_update_domain2D_r4_3d module procedure mpp_complete_update_domain2D_r4_4d @@ -1336,7 +1184,6 @@ module mpp_domains_mod module procedure mpp_complete_update_domain2D_r4_3dv module procedure mpp_complete_update_domain2D_r4_4dv module procedure mpp_complete_update_domain2D_r4_5dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_complete_update_domain2D_c4_2d module procedure mpp_complete_update_domain2D_c4_3d @@ -1355,13 +1202,9 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_start_do_update_c8_3d #endif -#ifndef 
no_8byte_integers module procedure mpp_start_do_update_i8_3d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_start_do_update_r4_3d module procedure mpp_start_do_update_r4_3dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_start_do_update_c4_3d #endif @@ -1374,13 +1217,9 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_complete_do_update_c8_3d #endif -#ifndef no_8byte_integers module procedure mpp_complete_do_update_i8_3d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_complete_do_update_r4_3d module procedure mpp_complete_do_update_r4_3dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_complete_do_update_c4_3d #endif @@ -1433,294 +1272,13 @@ module mpp_domains_mod module procedure mpp_reset_group_update_field_r8_4dv end interface mpp_reset_group_update_field - ! - ! - ! Set up a domain to pass data between aligned coarse and fine grid of nested - ! model. - ! - ! - ! Set up a domain to pass data between aligned coarse and fine grid of a nested - ! model. Supports multiple and telescoping nests. A telescoping nest is defined as - ! a nest within a nest. Nest domains may span multiple tiles, but cannot contain a - ! coarse-grid, cube corner. Concurrent nesting is the only supported mechanism, - ! i.e. coarse and fine grid are on individual, non-overlapping, processor lists. - ! Coarse and fine grid domain need to be defined before calling mpp_define_nest_domains. - ! An mpp_broadcast is needed to broadcast both fine and coarse grid domain onto all processors. - !
- !
- !
- !
- ! mpp_update_nest_coarse is used to pass data from fine grid to coarse grid computing domain. - ! mpp_update_nest_fine is used to pass data from coarse grid to fine grid halo. - ! You may call mpp_get_C2F_index before calling mpp_update_nest_fine to get the index for - ! passing data from coarse to fine. You may call mpp_get_F2C_index before calling - ! mpp_update_nest_coarse to get the index for passing data from coarse to fine. - !
- !
- !
- !
- - ! NOTE: The following tests for nesting of regular lat-lon grids upon a cubed-sphere - ! grid are done in test_mpp_domains: - ! a) a first-level nest spanning multiple cubed-sphere faces (tiles 1, 2, & 4) - ! b) a first-level nest wholly contained within tile 3 - ! c) a second-level nest contained within the nest mentioned in a) - ! Tests are done for data at T, E, C, N-cell center. - ! - ! Below is an example to pass data between fine and coarse grid (More details on how to - ! use the nesting domain update are available in routine test_update_nest_domain of - ! test_fms/mpp/test_mpp_domains.F90. - ! - !
-  !    if( concurrent ) then
-  !       call mpp_broadcast_domain(domain_fine)
-  !       call mpp_broadcast_domain(domain_coarse)
-  !    endif
-  !
-  !     call mpp_define_nest_domains (nest_domain, domain, num_nest, nest_level(1:num_nest),      &
-  !                                   tile_fine(1:num_nest), tile_coarse(1:num_nest),             &
-  !                                   istart_coarse(1:num_nest), icount_coarse(1:num_nest),       &
-  !                                   jstart_coarse(1:num_nest), jcount_coarse(1:num_nest),       &
-  !                                   npes_nest_tile, x_refine(1:num_nest), y_refine(1:num_nest), &
-  !                                   extra_halo=extra_halo, name="nest_domain")
-  !
-  !     call mpp_get_C2F_index(nest_domain, isw_f, iew_f, jsw_f, jew_f, isw_c, iew_c, jsw_c, jew_c, WEST, level)
-  !     call mpp_get_C2F_index(nest_domain, ise_f, iee_f, jse_f, jee_f, ise_c, iee_c, jse_c, jee_c, EAST, level)
-  !     call mpp_get_C2F_index(nest_domain, iss_f, ies_f, jss_f, jes_f, iss_c, ies_c, jss_c, jes_c, SOUTH, level)
-  !     call mpp_get_C2F_index(nest_domain, isn_f, ien_f, jsn_f, jen_f, isn_c, ien_c, jsn_c, jen_c, NORTH, level)
-  !
-  !     allocate(wbuffer(isw_c:iew_c, jsw_c:jew_c,nz))
-  !     allocate(ebuffer(ise_c:iee_c, jse_c:jee_c,nz))
-  !     allocate(sbuffer(iss_c:ies_c, jss_c:jes_c,nz))
-  !     allocate(nbuffer(isn_c:ien_c, jsn_c:jen_c,nz))
-  !     call mpp_update_nest_fine(x, nest_domain, wbuffer, sbuffer, ebuffer, nbuffer)
-  !
-  !     call mpp_get_F2C_index(nest_domain, is_c, ie_c, js_c, je_c, is_f, ie_f, js_f, je_f, nest_level=level)
-  !     allocate(buffer (is_f:ie_f, js_f:je_f,nz))
-  !     call mpp_update_nest_coarse(x, nest_domain, buffer)
-  !     
- - !
- ! - ! - ! - ! holds the information to pass data between nest and parent grids. - ! - ! - ! domain for the grid defined in the current pelist - ! - ! - ! number of nests - ! - ! - ! array containing the nest level for each nest (>1 implies a telescoping nest) - ! - ! - ! array containing tile number of the nest grid (monotonically increasing starting with 7) - ! - ! - ! array containing tile number of the parent grid corresponding to the lower left corner of a given nest - ! - ! - ! array containing index in the parent grid of the lower left corner of a given nest - ! - ! - ! array containing span of the nest on the parent grid - ! - ! - ! array containing number of pes to allocated to each defined tile - ! - ! - ! array containing refinement ratio for each nest - ! - ! - ! extra halo for passing data from coarse grid to fine grid. - ! default is 0 and currently only support extra_halo = 0. - ! optional argument - ! - ! - ! name of the nest domain - ! optional argument - ! - !
- - ! - ! - ! Get the index of the data passed from coarse grid to fine grid. - ! - ! - ! Get the index of the data passed from coarse grid to fine grid. - ! - ! - ! - ! - ! holds the information to pass data between fine and coarse grid. - ! - ! - ! index in the fine grid of the nested region - ! - ! - ! index in the coarse grid of the nested region - ! - ! - ! direction of the halo update. Its value should be WEST, EAST, SOUTH or NORTH. - ! - ! - ! level of the nest (> 1 implies a telescoping nest) - ! - ! - ! Cell position. It value should be CENTER, EAST, CORNER, or NORTH. - ! optional argument. - ! - ! - - ! - ! - ! Get the index of the data passed from fine grid to coarse grid. - ! - ! - ! Get the index of the data passed from fine grid to coarse grid. - ! - ! - ! - ! - ! Holds the information to pass data between fine and coarse grid. - ! - ! - ! index in the fine grid of the nested region - ! - ! - ! index in the coarse grid of the nested region - ! - ! - ! level of the nest (> 1 implies a telescoping nest) - ! - ! - ! Cell position. It value should be CENTER, EAST, CORNER, or NORTH. - ! - ! - - ! - ! - ! Pass the data from coarse grid to fill the buffer to be ready to be interpolated - ! onto fine grid. - ! - ! - ! Pass the data from coarse grid to fill the buffer to be ready to be interpolated - ! onto fine grid. - ! - ! - ! - ! - ! field on the model grid. - ! - ! - ! Holds the information to pass data between fine and coarse grid. - ! - ! - ! west side buffer to be filled with data on coarse grid. - ! - ! - ! east side buffer to be filled with data on coarse grid. - ! - ! - ! south side buffer to be filled with data on coarse grid. - ! - ! - ! north side buffer to be filled with data on coarse grid. - ! - ! - ! level of the nest (> 1 implies a telescoping nest) - ! - ! - ! Specify the direction of fine grid halo buffer to be filled. - ! Default value is XUPDATE+YUPDATE. - ! optional argument - ! - ! - ! When true, do the buffer filling. 
Default value is true. - ! optional argument - ! - ! - ! Cell position. It value should be CENTER, EAST, CORNER, or NORTH. Default is CENTER. - ! optional argument - ! - ! - ! extra halo for passing data from coarse grid to fine grid. - ! Default is 0 and currently only support extra_halo = 0. - ! optional argument - ! - ! - ! Name of the nest domain. - ! optional argument - ! - ! - ! Used to support multiple-tile-per-pe. default is 1 and currently - ! only support tile_count = 1. - ! optional argument - ! - ! - - ! - ! - ! Pass the data from fine grid to fill the buffer to be ready to be interpolated - ! onto coarse grid. - ! - ! - ! Pass the data from fine grid to fill the buffer to be ready to be interpolated - ! onto coarse grid. - ! - ! - ! - ! - ! field on the model grid. - ! - ! - ! Holds the information to pass data between fine and coarse grid. - ! - ! - ! field_out to be filled with data on coarse grid. - ! - ! - ! level of the nest (> 1 implies a telescoping nest) - ! - ! - ! When true, do the buffer filling. Default value is true. - ! optional argument - ! - ! - ! Cell position. It value should be CENTER, EAST, CORNER, or NORTH. Default is CENTER. - ! optional argument - ! - ! - ! Name of the nest domain. - ! optional argument - ! - ! - ! Used to support multiple-tile-per-pe. default is 1 and currently - ! only support tile_count = 1. - ! optional argument - ! - ! - + !> @brief Pass the data from coarse grid to fill the buffer to be ready to be interpolated + !! nto fine grid. + !> @detailed Pass the data from coarse grid to fill the buffer to be ready to be interpolated + !! onto fine grid. + !> @example call mpp_update_nest_fine(field, nest_domain, wbuffer, ebuffer, sbuffer, nbuffer, + !! nest_level, flags, complete, position, extra_halo, name, + !! 
tile_count) interface mpp_update_nest_fine module procedure mpp_update_nest_fine_r8_2d module procedure mpp_update_nest_fine_r8_3d @@ -1733,19 +1291,15 @@ module mpp_domains_mod module procedure mpp_update_nest_fine_c8_3d module procedure mpp_update_nest_fine_c8_4d #endif -#ifndef no_8byte_integers module procedure mpp_update_nest_fine_i8_2d module procedure mpp_update_nest_fine_i8_3d module procedure mpp_update_nest_fine_i8_4d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_update_nest_fine_r4_2d module procedure mpp_update_nest_fine_r4_3d module procedure mpp_update_nest_fine_r4_4d module procedure mpp_update_nest_fine_r4_2dv module procedure mpp_update_nest_fine_r4_3dv module procedure mpp_update_nest_fine_r4_4dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_update_nest_fine_c4_2d module procedure mpp_update_nest_fine_c4_3d @@ -1762,19 +1316,21 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_do_update_nest_fine_c8_3d #endif -#ifndef no_8byte_integers module procedure mpp_do_update_nest_fine_i8_3d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_do_update_nest_fine_r4_3d module procedure mpp_do_update_nest_fine_r4_3dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_do_update_nest_fine_c4_3d #endif module procedure mpp_do_update_nest_fine_i4_3d end interface + !> @brief Pass the data from fine grid to fill the buffer to be ready to be interpolated + !! onto coarse grid. + !> @detailed Pass the data from fine grid to fill the buffer to be ready to be interpolated + !! onto coarse grid. + !> @example call mpp_update_nest_coarse(field, nest_domain, field_out, nest_level, complete, + !! 
position, name, tile_count) interface mpp_update_nest_coarse module procedure mpp_update_nest_coarse_r8_2d module procedure mpp_update_nest_coarse_r8_3d @@ -1787,19 +1343,15 @@ module mpp_domains_mod module procedure mpp_update_nest_coarse_c8_3d module procedure mpp_update_nest_coarse_c8_4d #endif -#ifndef no_8byte_integers module procedure mpp_update_nest_coarse_i8_2d module procedure mpp_update_nest_coarse_i8_3d module procedure mpp_update_nest_coarse_i8_4d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_update_nest_coarse_r4_2d module procedure mpp_update_nest_coarse_r4_3d module procedure mpp_update_nest_coarse_r4_4d module procedure mpp_update_nest_coarse_r4_2dv module procedure mpp_update_nest_coarse_r4_3dv module procedure mpp_update_nest_coarse_r4_4dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_update_nest_coarse_c4_2d module procedure mpp_update_nest_coarse_c4_3d @@ -1816,19 +1368,19 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_do_update_nest_coarse_c8_3d #endif -#ifndef no_8byte_integers module procedure mpp_do_update_nest_coarse_i8_3d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_do_update_nest_coarse_r4_3d module procedure mpp_do_update_nest_coarse_r4_3dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_do_update_nest_coarse_c4_3d #endif module procedure mpp_do_update_nest_coarse_i4_3d end interface + !> @brief Get the index of the data passed from fine grid to coarse grid. + !> @detailed Get the index of the data passed from fine grid to coarse grid + !> @example call mpp_get_F2C_index(nest_domain, is_coarse, ie_coarse, js_coarse, je_coarse, + !! 
is_fine, ie_fine, js_fine, je_fine, nest_level, position) interface mpp_get_F2C_index module procedure mpp_get_F2C_index_fine module procedure mpp_get_F2C_index_coarse @@ -1855,7 +1407,6 @@ module mpp_domains_mod module procedure mpp_update_domains_ad_2D_r8_3dv module procedure mpp_update_domains_ad_2D_r8_4dv module procedure mpp_update_domains_ad_2D_r8_5dv -#ifdef OVERLOAD_R4 module procedure mpp_update_domains_ad_2D_r4_2d module procedure mpp_update_domains_ad_2D_r4_3d module procedure mpp_update_domains_ad_2D_r4_4d @@ -1864,7 +1415,6 @@ module mpp_domains_mod module procedure mpp_update_domains_ad_2D_r4_3dv module procedure mpp_update_domains_ad_2D_r4_4dv module procedure mpp_update_domains_ad_2D_r4_5dv -#endif end interface ! @@ -1874,13 +1424,9 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_do_update_c8_3d #endif -#ifndef no_8byte_integers module procedure mpp_do_update_i8_3d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_do_update_r4_3d module procedure mpp_do_update_r4_3dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_do_update_c4_3d #endif @@ -1893,13 +1439,9 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_do_check_c8_3d #endif -#ifndef no_8byte_integers module procedure mpp_do_check_i8_3d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_do_check_r4_3d module procedure mpp_do_check_r4_3dv -#endif #ifdef OVERLOAD_C4 module procedure mpp_do_check_c4_3d #endif @@ -1910,10 +1452,8 @@ module mpp_domains_mod interface mpp_pass_SG_to_UG module procedure mpp_pass_SG_to_UG_r8_2d module procedure mpp_pass_SG_to_UG_r8_3d -#ifdef OVERLOAD_R4 module procedure mpp_pass_SG_to_UG_r4_2d module procedure mpp_pass_SG_to_UG_r4_3d -#endif module procedure mpp_pass_SG_to_UG_i4_2d module procedure mpp_pass_SG_to_UG_i4_3d module procedure mpp_pass_SG_to_UG_l4_2d @@ -1923,10 +1463,8 @@ module mpp_domains_mod interface mpp_pass_UG_to_SG module procedure mpp_pass_UG_to_SG_r8_2d module procedure mpp_pass_UG_to_SG_r8_3d -#ifdef OVERLOAD_R4 
module procedure mpp_pass_UG_to_SG_r4_2d module procedure mpp_pass_UG_to_SG_r4_3d -#endif module procedure mpp_pass_UG_to_SG_i4_2d module procedure mpp_pass_UG_to_SG_i4_3d module procedure mpp_pass_UG_to_SG_l4_2d @@ -1940,29 +1478,15 @@ module mpp_domains_mod interface mpp_do_update_ad module procedure mpp_do_update_ad_r8_3d module procedure mpp_do_update_ad_r8_3dv -#ifdef OVERLOAD_R4 module procedure mpp_do_update_ad_r4_3d module procedure mpp_do_update_ad_r4_3dv -#endif end interface ! - -! -! -! Get the boundary data for symmetric domain when the data is at C, E, or N-cell center -! -! -! mpp_get_boundary is used to get the boundary data for symmetric domain -! when the data is at C, E, or N-cell center. For cubic grid, the data should -! always at C-cell center. -! -! -! -! +!> @brief Get the boundary data for symmetric domain when the data is at C, E, or N-cell center +!> @deteiled \e mpp_get_boundary is used to get the boundary data for symmetric domain +!! when the data is at C, E, or N-cell center. For cubic grid, the data should always +!! at C-cell center. +!> @example call mpp_get_boundary interface mpp_get_boundary module procedure mpp_get_boundary_r8_2d module procedure mpp_get_boundary_r8_3d @@ -1972,7 +1496,6 @@ module mpp_domains_mod module procedure mpp_get_boundary_r8_3dv ! module procedure mpp_get_boundary_r8_4dv ! module procedure mpp_get_boundary_r8_5dv -#ifdef OVERLOAD_R4 module procedure mpp_get_boundary_r4_2d module procedure mpp_get_boundary_r4_3d ! module procedure mpp_get_boundary_r4_4d @@ -1981,7 +1504,6 @@ module mpp_domains_mod module procedure mpp_get_boundary_r4_3dv ! module procedure mpp_get_boundary_r4_4dv ! 
module procedure mpp_get_boundary_r4_5dv -#endif end interface interface mpp_get_boundary_ad @@ -1989,52 +1511,37 @@ module mpp_domains_mod module procedure mpp_get_boundary_ad_r8_3d module procedure mpp_get_boundary_ad_r8_2dv module procedure mpp_get_boundary_ad_r8_3dv -#ifdef OVERLOAD_R4 module procedure mpp_get_boundary_ad_r4_2d module procedure mpp_get_boundary_ad_r4_3d module procedure mpp_get_boundary_ad_r4_2dv module procedure mpp_get_boundary_ad_r4_3dv -#endif end interface interface mpp_do_get_boundary module procedure mpp_do_get_boundary_r8_3d module procedure mpp_do_get_boundary_r8_3dv -#ifdef OVERLOAD_R4 module procedure mpp_do_get_boundary_r4_3d module procedure mpp_do_get_boundary_r4_3dv -#endif end interface interface mpp_do_get_boundary_ad module procedure mpp_do_get_boundary_ad_r8_3d module procedure mpp_do_get_boundary_ad_r8_3dv -#ifdef OVERLOAD_R4 module procedure mpp_do_get_boundary_ad_r4_3d module procedure mpp_do_get_boundary_ad_r4_3dv -#endif end interface -! -! -! Reorganization of distributed global arrays. -! -! -! mpp_redistribute is used to reorganize a distributed -! array. MPP_TYPE_ can be of type integer, -! complex, or real; of 4-byte or 8-byte kind; of rank -! up to 5. -! -! +!> @brief Reorganization of distributed global arrays. +!> @detailed \e mpp_redistribute is used to reorganize a distributed array. +!! \e MPP_TYPE_can be of type \e integer, \e complex, or \e real; +!! of 4-byte or 8-byte kind; of rank up to 5. +!> @example call mpp_redistribute( domain_in, field_in, domain_out, field_out ) ! ! field_in is dimensioned on the data domain of domain_in. ! ! ! field_out on the data domain of domain_out. ! -! 
interface mpp_redistribute module procedure mpp_redistribute_r8_2D module procedure mpp_redistribute_r8_3D @@ -2046,7 +1553,6 @@ module mpp_domains_mod module procedure mpp_redistribute_c8_4D module procedure mpp_redistribute_c8_5D #endif -#ifndef no_8byte_integers module procedure mpp_redistribute_i8_2D module procedure mpp_redistribute_i8_3D module procedure mpp_redistribute_i8_4D @@ -2055,13 +1561,10 @@ module mpp_domains_mod !!$ module procedure mpp_redistribute_l8_3D !!$ module procedure mpp_redistribute_l8_4D !!$ module procedure mpp_redistribute_l8_5D -#endif -#ifdef OVERLOAD_R4 module procedure mpp_redistribute_r4_2D module procedure mpp_redistribute_r4_3D module procedure mpp_redistribute_r4_4D module procedure mpp_redistribute_r4_5D -#endif #ifdef OVERLOAD_C4 module procedure mpp_redistribute_c4_2D module procedure mpp_redistribute_c4_3D @@ -2083,13 +1586,9 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_do_redistribute_c8_3D #endif -#ifndef no_8byte_integers module procedure mpp_do_redistribute_i8_3D module procedure mpp_do_redistribute_l8_3D -#endif -#ifdef OVERLOAD_R4 module procedure mpp_do_redistribute_r4_3D -#endif #ifdef OVERLOAD_C4 module procedure mpp_do_redistribute_c4_3D #endif @@ -2185,7 +1684,6 @@ module mpp_domains_mod module procedure mpp_global_field2D_c8_4d module procedure mpp_global_field2D_c8_5d #endif -#ifndef no_8byte_integers module procedure mpp_global_field2D_i8_2d module procedure mpp_global_field2D_i8_3d module procedure mpp_global_field2D_i8_4d @@ -2194,13 +1692,10 @@ module mpp_domains_mod module procedure mpp_global_field2D_l8_3d module procedure mpp_global_field2D_l8_4d module procedure mpp_global_field2D_l8_5d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_global_field2D_r4_2d module procedure mpp_global_field2D_r4_3d module procedure mpp_global_field2D_r4_4d module procedure mpp_global_field2D_r4_5d -#endif #ifdef OVERLOAD_C4 module procedure mpp_global_field2D_c4_2d module procedure 
mpp_global_field2D_c4_3d @@ -2228,7 +1723,6 @@ module mpp_domains_mod module procedure mpp_global_field2D_c8_4d_ad module procedure mpp_global_field2D_c8_5d_ad #endif -#ifndef no_8byte_integers module procedure mpp_global_field2D_i8_2d_ad module procedure mpp_global_field2D_i8_3d_ad module procedure mpp_global_field2D_i8_4d_ad @@ -2237,13 +1731,10 @@ module mpp_domains_mod module procedure mpp_global_field2D_l8_3d_ad module procedure mpp_global_field2D_l8_4d_ad module procedure mpp_global_field2D_l8_5d_ad -#endif -#ifdef OVERLOAD_R4 module procedure mpp_global_field2D_r4_2d_ad module procedure mpp_global_field2D_r4_3d_ad module procedure mpp_global_field2D_r4_4d_ad module procedure mpp_global_field2D_r4_5d_ad -#endif #ifdef OVERLOAD_C4 module procedure mpp_global_field2D_c4_2d_ad module procedure mpp_global_field2D_c4_3d_ad @@ -2265,13 +1756,9 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_do_global_field2D_c8_3d #endif -#ifndef no_8byte_integers module procedure mpp_do_global_field2D_i8_3d module procedure mpp_do_global_field2D_l8_3d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_do_global_field2D_r4_3d -#endif #ifdef OVERLOAD_C4 module procedure mpp_do_global_field2D_c4_3d #endif @@ -2284,13 +1771,9 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_do_global_field2D_a2a_c8_3d #endif -#ifndef no_8byte_integers module procedure mpp_do_global_field2D_a2a_i8_3d module procedure mpp_do_global_field2D_a2a_l8_3d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_do_global_field2D_a2a_r4_3d -#endif #ifdef OVERLOAD_C4 module procedure mpp_do_global_field2D_a2a_c4_3d #endif @@ -2303,18 +1786,14 @@ module mpp_domains_mod module procedure mpp_global_field2D_ug_r8_3d module procedure mpp_global_field2D_ug_r8_4d module procedure mpp_global_field2D_ug_r8_5d -#ifndef no_8byte_integers module procedure mpp_global_field2D_ug_i8_2d module procedure mpp_global_field2D_ug_i8_3d module procedure mpp_global_field2D_ug_i8_4d module procedure 
mpp_global_field2D_ug_i8_5d -#endif -#ifdef OVERLOAD_R4 module procedure mpp_global_field2D_ug_r4_2d module procedure mpp_global_field2D_ug_r4_3d module procedure mpp_global_field2D_ug_r4_4d module procedure mpp_global_field2D_ug_r4_5d -#endif module procedure mpp_global_field2D_ug_i4_2d module procedure mpp_global_field2D_ug_i4_3d module procedure mpp_global_field2D_ug_i4_4d @@ -2326,13 +1805,9 @@ module mpp_domains_mod #ifdef OVERLOAD_C8 module procedure mpp_do_global_field2D_c8_3d_ad #endif -#ifndef no_8byte_integers module procedure mpp_do_global_field2D_i8_3d_ad module procedure mpp_do_global_field2D_l8_3d_ad -#endif -#ifdef OVERLOAD_R4 module procedure mpp_do_global_field2D_r4_3d_ad -#endif #ifdef OVERLOAD_C4 module procedure mpp_do_global_field2D_c4_3d_ad #endif @@ -2340,26 +1815,17 @@ module mpp_domains_mod module procedure mpp_do_global_field2D_l4_3d_ad end interface -! -! -! Global max/min of domain-decomposed arrays. -! -! -! mpp_global_max is used to get the maximum value of a -! domain-decomposed array on each PE. MPP_TYPE_ can be of type -! integer or real; of 4-byte or 8-byte kind; of rank -! up to 5. The dimension of locus must equal the rank of -! field. +!> @example Global max/min of domain-decomposed arrays. +!> @detailed \e mpp_global_max is used to get the maximum value of a +!! domain-decomposed array on each PE. \e MPP_TYPE_can be of type +!! \e integer or \e real; of 4-byte or 8-byte kind; of rank +!! up to 5. The dimension of \e locus must equal the rank of \e field.\n +!!\n +!! All PEs in a domain decomposition must call \e mpp_global_max, +!! and each will have the result upon exit. +!! The function \e mpp_global_min, with an identical syntax. is also available. ! -! All PEs in a domain decomposition must call -! mpp_global_max, and each will have the result upon exit. -! -! The function mpp_global_min, with an identical syntax. is -! also available. -! -! +!> @example mpp_global_max( domain, field, locus ) ! ! ! 
field is dimensioned on either the compute domain or the @@ -2369,25 +1835,19 @@ module mpp_domains_mod ! locus, if present, can be used to retrieve the location of ! the maximum (as in the MAXLOC intrinsic of f90). ! -! - interface mpp_global_max module procedure mpp_global_max_r8_2d module procedure mpp_global_max_r8_3d module procedure mpp_global_max_r8_4d module procedure mpp_global_max_r8_5d -#ifdef OVERLOAD_R4 module procedure mpp_global_max_r4_2d module procedure mpp_global_max_r4_3d module procedure mpp_global_max_r4_4d module procedure mpp_global_max_r4_5d -#endif -#ifndef no_8byte_integers module procedure mpp_global_max_i8_2d module procedure mpp_global_max_i8_3d module procedure mpp_global_max_i8_4d module procedure mpp_global_max_i8_5d -#endif module procedure mpp_global_max_i4_2d module procedure mpp_global_max_i4_3d module procedure mpp_global_max_i4_4d @@ -2399,37 +1859,25 @@ module mpp_domains_mod module procedure mpp_global_min_r8_3d module procedure mpp_global_min_r8_4d module procedure mpp_global_min_r8_5d -#ifdef OVERLOAD_R4 module procedure mpp_global_min_r4_2d module procedure mpp_global_min_r4_3d module procedure mpp_global_min_r4_4d module procedure mpp_global_min_r4_5d -#endif -#ifndef no_8byte_integers module procedure mpp_global_min_i8_2d module procedure mpp_global_min_i8_3d module procedure mpp_global_min_i8_4d module procedure mpp_global_min_i8_5d -#endif module procedure mpp_global_min_i4_2d module procedure mpp_global_min_i4_3d module procedure mpp_global_min_i4_4d module procedure mpp_global_min_i4_5d end interface -! -! -! Global sum of domain-decomposed arrays. -! -! -! mpp_global_sum is used to get the sum of a -! domain-decomposed array on each PE. MPP_TYPE_ can be of type -! integer, complex, or real; of 4-byte or -! 8-byte kind; of rank up to 5. -! -! +!> @brief Global sum of domain-decomposed arrays. +!> @detailed \e mpp_global_sum is used to get the sum of a domain-decomposed array +!! on each PE. 
\e MPP_TYPE_ can be of type \e integer, \e complex, or \e real; of 4-byte or +!! 8-byte kind; of rank up to 5. +!> @example call mpp_global_sum( domain, field, flags ) ! ! ! field is dimensioned on either the compute domain or the @@ -2447,12 +1895,8 @@ module mpp_domains_mod ! SRC="mpp.html#mpp_sum">mpp_sum across the domain ! decomposition. ! -! -! All PEs in a domain decomposition must call -! mpp_global_sum, and each will have the result upon exit. -! -! - +!> @note All PEs in a domain decomposition must call \e mpp_global_sum, +!! and each will have the result upon exit. interface mpp_global_sum module procedure mpp_global_sum_r8_2d module procedure mpp_global_sum_r8_3d @@ -2464,24 +1908,20 @@ module mpp_domains_mod module procedure mpp_global_sum_c8_4d module procedure mpp_global_sum_c8_5d #endif -#ifdef OVERLOAD_R4 module procedure mpp_global_sum_r4_2d module procedure mpp_global_sum_r4_3d module procedure mpp_global_sum_r4_4d module procedure mpp_global_sum_r4_5d -#endif #ifdef OVERLOAD_C4 module procedure mpp_global_sum_c4_2d module procedure mpp_global_sum_c4_3d module procedure mpp_global_sum_c4_4d module procedure mpp_global_sum_c4_5d #endif -#ifndef no_8byte_integers module procedure mpp_global_sum_i8_2d module procedure mpp_global_sum_i8_3d module procedure mpp_global_sum_i8_4d module procedure mpp_global_sum_i8_5d -#endif module procedure mpp_global_sum_i4_2d module procedure mpp_global_sum_i4_3d module procedure mpp_global_sum_i4_4d @@ -2500,24 +1940,20 @@ module mpp_domains_mod module procedure mpp_global_sum_tl_c8_4d module procedure mpp_global_sum_tl_c8_5d #endif -#ifdef OVERLOAD_R4 module procedure mpp_global_sum_tl_r4_2d module procedure mpp_global_sum_tl_r4_3d module procedure mpp_global_sum_tl_r4_4d module procedure mpp_global_sum_tl_r4_5d -#endif #ifdef OVERLOAD_C4 module procedure mpp_global_sum_tl_c4_2d module procedure mpp_global_sum_tl_c4_3d module procedure mpp_global_sum_tl_c4_4d module procedure mpp_global_sum_tl_c4_5d #endif -#ifndef 
no_8byte_integers module procedure mpp_global_sum_tl_i8_2d module procedure mpp_global_sum_tl_i8_3d module procedure mpp_global_sum_tl_i8_4d module procedure mpp_global_sum_tl_i8_5d -#endif module procedure mpp_global_sum_tl_i4_2d module procedure mpp_global_sum_tl_i4_3d module procedure mpp_global_sum_tl_i4_4d @@ -2537,24 +1973,20 @@ module mpp_domains_mod module procedure mpp_global_sum_ad_c8_4d module procedure mpp_global_sum_ad_c8_5d #endif -#ifdef OVERLOAD_R4 module procedure mpp_global_sum_ad_r4_2d module procedure mpp_global_sum_ad_r4_3d module procedure mpp_global_sum_ad_r4_4d module procedure mpp_global_sum_ad_r4_5d -#endif #ifdef OVERLOAD_C4 module procedure mpp_global_sum_ad_c4_2d module procedure mpp_global_sum_ad_c4_3d module procedure mpp_global_sum_ad_c4_4d module procedure mpp_global_sum_ad_c4_5d #endif -#ifndef no_8byte_integers module procedure mpp_global_sum_ad_i8_2d module procedure mpp_global_sum_ad_i8_3d module procedure mpp_global_sum_ad_i8_4d module procedure mpp_global_sum_ad_i8_5d -#endif module procedure mpp_global_sum_ad_i4_2d module procedure mpp_global_sum_ad_i4_3d module procedure mpp_global_sum_ad_i4_4d @@ -2567,55 +1999,38 @@ module mpp_domains_mod ! public interface from mpp_domain_util.h ! !*********************************************************************** - - ! - ! - ! Retrieve PE number of a neighboring domain. - ! - ! - ! Given a 1-D or 2-D domain decomposition, this call allows users to retrieve - ! the PE number of an adjacent PE-domain while taking into account that the - ! domain may have holes (masked) and/or have cyclic boundary conditions and/or a - ! folded edge. Which PE-domain will be retrived will depend on "direction": - ! +1 (right) or -1 (left) for a 1-D domain decomposition and either NORTH, SOUTH, - ! EAST, WEST, NORTH_EAST, SOUTH_EAST, SOUTH_WEST, or NORTH_WEST for a 2-D - ! decomposition. If no neighboring domain exists (masked domain), then the - ! returned "pe" value will be set to NULL_PE. - ! - ! - ! 
+ !> @brief Retrieve PE number of a neighboring domain. + !> @detailed Given a 1-D or 2-D domain decomposition, this call allows users to retrieve + !! the PE number of an adjacent PE-domain while taking into account that the + !! domain may have holes (masked) and/or have cyclic boundary conditions and/or a + !! folded edge. Which PE-domain will be retrived will depend on "direction": + !! +1 (right) or -1 (left) for a 1-D domain decomposition and either NORTH, SOUTH, + !! EAST, WEST, NORTH_EAST, SOUTH_EAST, SOUTH_WEST, or NORTH_WEST for a 2-D + !! decomposition. If no neighboring domain exists (masked domain), then the + !! returned "pe" value will be set to NULL_PE. + ! + !> @example call mpp_get_neighbor_pe( domain1d, direction=+1 , pe) + !> @example call mpp_get_neighbor_pe( domain2d, direction=NORTH, pe) interface mpp_get_neighbor_pe module procedure mpp_get_neighbor_pe_1d module procedure mpp_get_neighbor_pe_2d end interface - ! - ! - ! Equality/inequality operators for domaintypes. - ! - ! - ! The module provides public operators to check for - ! equality/inequality of domaintypes, e.g: - ! - !
-  !    type(domain1D) :: a, b
-  !    type(domain2D) :: c, d
-  !    ...
-  !    if( a.NE.b )then
-  !        ...
-  !    end if
-  !    if( c==d )then
-  !        ...
-  !    end if
-  !    
- ! - ! Domains are considered equal if and only if the start and end - ! indices of each of their component global, data and compute domains - ! are equal. - !
- !
+ + !> @brief Equality/inequality operators for domaintypes. + !> @detailed The module provides public operators to check for + !! equality/inequality of domaintypes, e.g:\n + !! type(domain1D) :: a, b\n + !! type(domain2D) :: c, d\n + !! ...\n + !! if( a.NE.b )then\n + !! ...\n + !! end if\n + !! if( c==d )then\n + !! ...\n + !! end if\n + !!\n + !! Domains are considered equal if and only if the start and end + !! indices of each of their component global, data and compute domains are equal. interface operator(.EQ.) module procedure mpp_domain1D_eq module procedure mpp_domain2D_eq @@ -2628,40 +2043,24 @@ module mpp_domains_mod module procedure mpp_domainUG_ne end interface - ! - ! - ! These routines retrieve the axis specifications associated with the compute domains. - ! - ! - ! The domain is a derived type with private elements. These routines - ! retrieve the axis specifications associated with the compute domains - ! The 2D version of these is a simple extension of 1D. - ! - ! - ! + !> @brief These routines retrieve the axis specifications associated with the compute domains. + !! @detailed The domain is a derived type with private elements. These routines + !! retrieve the axis specifications associated with the compute domains + !! The 2D version of these is a simple extension of 1D. + !> @example call mpp_get_compute_domain interface mpp_get_compute_domain module procedure mpp_get_compute_domain1D module procedure mpp_get_compute_domain2D end interface - ! - ! - ! Retrieve the entire array of compute domain extents associated with a decomposition. - ! - ! - ! Retrieve the entire array of compute domain extents associated with a decomposition. - ! - ! + !> @brief Retrieve the entire array of compute domain extents associated with a decomposition. + !> @detailed Retrieve the entire array of compute domain extents associated with a decomposition. + !> @examplecall mpp_get_compute_domains( domain, xbegin, xend, xsize, &\n + !! ybegin, yend, ysize ) ! ! ! ! - ! 
interface mpp_get_compute_domains module procedure mpp_get_compute_domains1D module procedure mpp_get_compute_domains2D @@ -2672,56 +2071,31 @@ module mpp_domains_mod module procedure mpp_get_global_domains2D end interface - - ! - ! - ! These routines retrieve the axis specifications associated with the data domains. - ! - ! - ! The domain is a derived type with private elements. These routines - ! retrieve the axis specifications associated with the data domains. - ! The 2D version of these is a simple extension of 1D. - ! - ! - ! + !> @brief These routines retrieve the axis specifications associated with the data domains. + !> @detailed The domain is a derived type with private elements. These routines + !! retrieve the axis specifications associated with the data domains. + !! The 2D version of these is a simple extension of 1D. + !> @example call mpp_get_data_domain interface mpp_get_data_domain module procedure mpp_get_data_domain1D module procedure mpp_get_data_domain2D end interface - ! - ! - ! These routines retrieve the axis specifications associated with the global domains. - ! - ! - ! The domain is a derived type with private elements. These routines - ! retrieve the axis specifications associated with the global domains. - ! The 2D version of these is a simple extension of 1D. - ! - ! - ! + !> @brief These routines retrieve the axis specifications associated with the global domains. + !> @detailed The domain is a derived type with private elements. These routines + !! retrieve the axis specifications associated with the global domains. + !! The 2D version of these is a simple extension of 1D. + !> @example call mpp_get_global_domain interface mpp_get_global_domain module procedure mpp_get_global_domain1D module procedure mpp_get_global_domain2D end interface - ! - ! - ! These routines retrieve the axis specifications associated with the memory domains. - ! - ! - ! The domain is a derived type with private elements. These routines - ! 
retrieve the axis specifications associated with the memory domains. - ! The 2D version of these is a simple extension of 1D. - ! - ! - ! + !> @brief These routines retrieve the axis specifications associated with the memory domains. + !> @detailed The domain is a derived type with private elements. These routines + !! retrieve the axis specifications associated with the memory domains. + !! The 2D version of these is a simple extension of 1D. + !> @example call mpp_get_memory_domain interface mpp_get_memory_domain module procedure mpp_get_memory_domain1D module procedure mpp_get_memory_domain2D @@ -2732,97 +2106,53 @@ module mpp_domains_mod module procedure mpp_get_domain_extents2D end interface - ! - ! - ! These routines set the axis specifications associated with the compute domains. - ! - ! - ! The domain is a derived type with private elements. These routines - ! set the axis specifications associated with the compute domains - ! The 2D version of these is a simple extension of 1D. - ! - ! - ! + !> @brief These routines set the axis specifications associated with the compute domains. + !> @detailed The domain is a derived type with private elements. These routines + !! set the axis specifications associated with the compute domains + !! The 2D version of these is a simple extension of 1D. + !> @example call mpp_set_compute_domain interface mpp_set_compute_domain module procedure mpp_set_compute_domain1D module procedure mpp_set_compute_domain2D end interface - ! - ! - ! These routines set the axis specifications associated with the data domains. - ! - ! - ! The domain is a derived type with private elements. These routines - ! set the axis specifications associated with the data domains. - ! The 2D version of these is a simple extension of 1D. - ! - ! - ! + !> @brief These routines set the axis specifications associated with the data domains. + !> @detailed The domain is a derived type with private elements. These routines + !! 
set the axis specifications associated with the data domains. + !! The 2D version of these is a simple extension of 1D. + !> @example call mpp_set_data_domain interface mpp_set_data_domain module procedure mpp_set_data_domain1D module procedure mpp_set_data_domain2D end interface - ! - ! - ! These routines set the axis specifications associated with the global domains. - ! - ! - ! The domain is a derived type with private elements. These routines - ! set the axis specifications associated with the global domains. - ! The 2D version of these is a simple extension of 1D. - ! - ! - ! + !> @brief These routines set the axis specifications associated with the global domains. + !> @detailed The domain is a derived type with private elements. These routines + !! set the axis specifications associated with the global domains. + !! The 2D version of these is a simple extension of 1D. + !> @example call mpp_set_global_domain interface mpp_set_global_domain module procedure mpp_set_global_domain1D module procedure mpp_set_global_domain2D end interface - - ! - ! - ! Retrieve list of PEs associated with a domain decomposition. - ! - ! - ! The 1D version of this call returns an array of the PEs assigned to this 1D domain - ! decomposition. In addition the optional argument pos may be - ! used to retrieve the 0-based position of the domain local to the - ! calling PE, i.e domain%list(pos)%pe is the local PE, - ! as returned by mpp_pe(). - ! The 2D version of this call is identical to 1D version. - ! - ! - ! - ! - ! + !> @brief Retrieve list of PEs associated with a domain decomposition. + !> @detailed The 1D version of this call returns an array of the PEs assigned to + !! this 1D domain decomposition. In addition the optional argument pos may be + !! used to retrieve the 0-based position of the domain local to the + !! calling PE, i.e., \e domain%list(pos)%pe is the local PE, + !! as returned by mpp_pe() + !! The 2D version of this call is identical to 1D version. 
interface mpp_get_pelist module procedure mpp_get_pelist1D module procedure mpp_get_pelist2D end interface - ! - ! - ! Retrieve layout associated with a domain decomposition. - ! - ! - ! The 1D version of this call returns the number of divisions that was assigned to this - ! decomposition axis. The 2D version of this call returns an array of - ! dimension 2 holding the results on two axes. - ! - ! - ! - ! - ! + !> @brief Retrieve layout associated with a domain decomposition + !> @detailed The 1D version of this call returns the number of divisions that was assigned to this + !! decomposition axis. The 2D version of this call returns an array of dimension 2 holding the + !! results on two axes. + !> @example call mpp_get_layout( domain, layout ) interface mpp_get_layout module procedure mpp_get_layout1D module procedure mpp_get_layout2D @@ -2833,19 +2163,10 @@ module mpp_domains_mod module procedure check_data_size_2d end interface - ! - ! - ! nullify domain list. - ! - ! - ! Nullify domain list. This interface is needed in mpp_domains_test. - ! 1-D case can be added in if needed. - ! - ! - ! - ! + !> @brief nullify domain list. + !> @detailed Nullify domain list. This interface is needed in mpp_domains_test. + !! 1-D case can be added in if needed. + !> @example call mpp_nullify_domain_list(domain) interface mpp_nullify_domain_list module procedure nullify_domain2d_list end interface @@ -2866,46 +2187,3 @@ module mpp_domains_mod #include end module mpp_domains_mod - -! - -! -! Any module or program unit using mpp_domains_mod -! must contain the line - -!
-!     use mpp_domains_mod
-!     
- -! mpp_domains_mod uses mpp_mod, and therefore is subject to the compiling and linking requirements of that module. -!
-! -! mpp_domains_mod uses standard f90, and has no special -! requirements. There are some OS-dependent -! pre-processor directives that you might need to modify on -! non-SGI/Cray systems and compilers. The portability of mpp_mod -! obviously is a constraint, since this module is built on top of -! it. Contact me, Balaji, SGI/GFDL, with questions. -! -! -! The mpp_domains source consists of the main source file -! mpp_domains.F90 and also requires the following include files: -!
-!     fms_platform.h
-!     mpp_update_domains2D.h
-!     mpp_global_reduce.h
-!     mpp_global_sum.h
-!     mpp_global_field.h
-!    
-! GFDL users can check it out of the main CVS repository as part of -! the mpp CVS module. The current public tag is galway. -! External users can download the latest mpp package here. Public access -! to the GFDL CVS repository will soon be made available. - -!
- -!
diff --git a/mpp/mpp_efp.F90 b/mpp/mpp_efp.F90 index b4439b8088..bc86d0610b 100644 --- a/mpp/mpp_efp.F90 +++ b/mpp/mpp_efp.F90 @@ -16,12 +16,14 @@ !* You should have received a copy of the GNU Lesser General Public !* License along with FMS. If not, see . !*********************************************************************** +!> This module provides interfaces to the non-domain-oriented communication +!! subroutines. module mpp_efp_mod -#include use mpp_mod, only : mpp_error, FATAL, WARNING, NOTE use mpp_mod, only : mpp_pe, mpp_root_pe, mpp_npes use mpp_mod, only : mpp_sum +use platform_mod implicit none ; private @@ -30,27 +32,24 @@ module mpp_efp_mod public :: operator(+), operator(-), assignment(=) public :: mpp_query_efp_overflow_error, mpp_reset_efp_overlow_error -! This module provides interfaces to the non-domain-oriented communication -! subroutines. -integer, parameter :: NUMBIT = 46 ! number of bits used in the 64-bit signed integer representation. -integer, parameter :: NUMINT = 6 ! The number of long integers to use to represent - ! a real number. - -integer(LONG_KIND), parameter :: prec=2_8**NUMBIT ! The precision of each integer. -real(DOUBLE_KIND), parameter :: r_prec=2.0_8**NUMBIT ! A real version of prec. -real(DOUBLE_KIND), parameter :: I_prec=1.0_8/(2.0_8**NUMBIT) ! The inverse of prec. -integer, parameter :: max_count_prec=2**(63-NUMBIT)-1 - ! The number of values that can be added together - ! with the current value of prec before there will - ! be roundoff problems. - -real(DOUBLE_KIND), parameter, dimension(NUMINT) :: & +integer, parameter :: NUMBIT = 46 !< number of bits used in the 64-bit signed integer representation. +integer, parameter :: NUMINT = 6 !< The number of long integers to use to represent + !! a real number. + +integer(i8_kind), parameter :: prec=2_8**NUMBIT !< The precision of each integer. +real(r8_kind), parameter :: r_prec=2.0_8**NUMBIT !< A real version of prec. 
+real(r8_kind), parameter :: I_prec=1.0_8/(2.0_8**NUMBIT) !< The inverse of prec. +integer, parameter :: max_count_prec=2**(63-NUMBIT)-1 !< The number of values that can be added together + !! with the current value of prec before there will + !! be roundoff problems. + +real(r8_kind), parameter, dimension(NUMINT) :: & pr = (/ r_prec**2, r_prec, 1.0_8, 1.0_8/r_prec, 1.0_8/r_prec**2, 1.0_8/r_prec**3 /) -real(DOUBLE_KIND), parameter, dimension(NUMINT) :: & +real(r8_kind), parameter, dimension(NUMINT) :: & I_pr = (/ 1.0_8/r_prec**2, 1.0_8/r_prec, 1.0_8, r_prec, r_prec**2, r_prec**3 /) logical :: overflow_error = .false., NaN_error = .false. -logical :: debug = .false. ! Making this true enables debugging output. +logical :: debug = .false. !< Making this true enables debugging output. interface mpp_reproducing_sum module procedure mpp_reproducing_sum_r8_2d @@ -58,10 +57,10 @@ module mpp_efp_mod module procedure mpp_reproducing_sum_r4_2d end interface mpp_reproducing_sum -! The Extended Fixed Point (mpp_efp) type provides a public interface for doing -! sums and taking differences with this type. +!> The Extended Fixed Point (mpp_efp) type provides a public interface for doing +!! sums and taking differences with this type. type, public :: mpp_efp_type ; private - integer(kind=8), dimension(NUMINT) :: v + integer(i8_kind), dimension(NUMINT) :: v end type mpp_efp_type interface operator (+); module procedure mpp_efp_plus ; end interface @@ -70,24 +69,24 @@ module mpp_efp_mod contains +!> This subroutine uses a conversion to an integer representation +!! of real numbers to give order-invariant sums that will reproduce +!! across PE count. +!> @note This idea comes from R. Hallberg and A. Adcroft. 
function mpp_reproducing_sum_r8_2d(array, isr, ier, jsr, jer, EFP_sum, reproducing, & overflow_check, err) result(sum) - real(DOUBLE_KIND), dimension(:,:), intent(in) :: array + real(r8_kind), dimension(:,:), intent(in) :: array integer, optional, intent(in) :: isr, ier, jsr, jer type(mpp_efp_type), optional, intent(out) :: EFP_sum logical, optional, intent(in) :: reproducing logical, optional, intent(in) :: overflow_check integer, optional, intent(out) :: err - real(DOUBLE_KIND) :: sum ! Result + real(r8_kind) :: sum ! Result - ! This subroutine uses a conversion to an integer representation - ! of real numbers to give order-invariant sums that will reproduce - ! across PE count. This idea comes from R. Hallberg and A. Adcroft. - - integer(LONG_KIND), dimension(NUMINT) :: ints_sum - integer(LONG_KIND) :: ival, prec_error - real(DOUBLE_KIND) :: rsum(1), rs - real(DOUBLE_KIND) :: max_mag_term + integer(i8_kind), dimension(NUMINT) :: ints_sum + integer(i8_kind) :: ival, prec_error + real(r8_kind) :: rsum(1), rs + real(r8_kind) :: max_mag_term logical :: repro, over_check character(len=256) :: mesg integer :: i, j, n, is, ie, js, je, sgn @@ -217,15 +216,15 @@ end function mpp_reproducing_sum_r8_2d function mpp_reproducing_sum_r4_2d(array, isr, ier, jsr, jer, EFP_sum, reproducing, & overflow_check, err) result(sum) - real(FLOAT_KIND), dimension(:,:), intent(in) :: array + real(r4_kind), dimension(:,:), intent(in) :: array integer, optional, intent(in) :: isr, ier, jsr, jer type(mpp_efp_type), optional, intent(out) :: EFP_sum logical, optional, intent(in) :: reproducing logical, optional, intent(in) :: overflow_check integer, optional, intent(out) :: err - real(FLOAT_KIND) :: sum ! 
Result + real(r4_kind) :: sum !< Result - real(DOUBLE_KIND) :: array_r8(size(array,1), size(array,2)) + real(r8_kind) :: array_r8(size(array,1), size(array,2)) array_r8 = array @@ -236,24 +235,22 @@ function mpp_reproducing_sum_r4_2d(array, isr, ier, jsr, jer, EFP_sum, reproduci end function mpp_reproducing_sum_r4_2d - +!> This function uses a conversion to an integer representation +!! of real numbers to give order-invariant sums that will reproduce +!! across PE count. This idea comes from R. Hallberg and A. Adcroft. function mpp_reproducing_sum_r8_3d(array, isr, ier, jsr, jer, sums, EFP_sum, err) & result(sum) - real(DOUBLE_KIND), dimension(:,:,:), intent(in) :: array + real(r8_kind), dimension(:,:,:), intent(in) :: array integer, optional, intent(in) :: isr, ier, jsr, jer - real(DOUBLE_KIND), dimension(:), optional, intent(out) :: sums + real(r8_kind), dimension(:), optional, intent(out) :: sums type(mpp_efp_type), optional, intent(out) :: EFP_sum integer, optional, intent(out) :: err - real(DOUBLE_KIND) :: sum ! Result + real(r8_kind) :: sum !< Result - ! This subroutine uses a conversion to an integer representation - ! of real numbers to give order-invariant sums that will reproduce - ! across PE count. This idea comes from R. Hallberg and A. Adcroft. - - real(DOUBLE_KIND) :: max_mag_term - integer(LONG_KIND), dimension(NUMINT) :: ints_sum - integer(LONG_KIND), dimension(NUMINT,size(array,3)) :: ints_sums - integer(LONG_KIND) :: prec_error + real(r8_kind) :: max_mag_term + integer(i8_kind), dimension(NUMINT) :: ints_sum + integer(i8_kind), dimension(NUMINT,size(array,3)) :: ints_sums + integer(i8_kind) :: prec_error character(len=256) :: mesg integer :: i, j, k, is, ie, js, je, ke, isz, jsz, n @@ -402,17 +399,17 @@ function mpp_reproducing_sum_r8_3d(array, isr, ier, jsr, jer, sums, EFP_sum, err end function mpp_reproducing_sum_r8_3d +!> This function converts a real number to an equivalent representation +!! using several long integers. 
function real_to_ints(r, prec_error, overflow) result(ints) - real(DOUBLE_KIND), intent(in) :: r - integer(LONG_KIND), optional, intent(in) :: prec_error + real(r8_kind), intent(in) :: r + integer(i8_kind), optional, intent(in) :: prec_error logical, optional, intent(inout) :: overflow - integer(LONG_KIND), dimension(NUMINT) :: ints - ! This subroutine converts a real number to an equivalent representation - ! using several long integers. + integer(i8_kind), dimension(NUMINT) :: ints - real(DOUBLE_KIND) :: rs + real(r8_kind) :: rs character(len=80) :: mesg - integer(LONG_KIND) :: ival, prec_err + integer(i8_kind) :: ival, prec_err integer :: sgn, i prec_err = prec ; if (present(prec_error)) prec_err = prec_error @@ -438,10 +435,10 @@ function real_to_ints(r, prec_error, overflow) result(ints) end function real_to_ints +!> This function reverses the conversion in real_to_ints. function ints_to_real(ints) result(r) - integer(LONG_KIND), dimension(NUMINT), intent(in) :: ints - real(DOUBLE_KIND) :: r - ! This subroutine reverses the conversion in real_to_ints. + integer(i8_kind), dimension(NUMINT), intent(in) :: ints + real(r8_kind) :: r integer :: i @@ -449,13 +446,13 @@ function ints_to_real(ints) result(r) do i=1,NUMINT ; r = r + pr(i)*ints(i) ; enddo end function ints_to_real +!> This subroutine increments a number with another, both using the integer +!! representation in real_to_ints. subroutine increment_ints(int_sum, int2, prec_error) - integer(LONG_KIND), dimension(NUMINT), intent(inout) :: int_sum - integer(LONG_KIND), dimension(NUMINT), intent(in) :: int2 - integer(LONG_KIND), optional, intent(in) :: prec_error + integer(i8_kind), dimension(NUMINT), intent(inout) :: int_sum + integer(i8_kind), dimension(NUMINT), intent(in) :: int2 + integer(i8_kind), optional, intent(in) :: prec_error - ! This subroutine increments a number with another, both using the integer - ! representation in real_to_ints. 
integer :: i do i=NUMINT,2,-1 @@ -478,16 +475,16 @@ subroutine increment_ints(int_sum, int2, prec_error) end subroutine increment_ints +!> This subroutine increments a number with another, both using the integer +!! representation in real_to_ints, but without doing any carrying of overflow. +!! The entire operation is embedded in a single call for greater speed. subroutine increment_ints_faster(int_sum, r, max_mag_term) - integer(LONG_KIND), dimension(NUMINT), intent(inout) :: int_sum - real(DOUBLE_KIND), intent(in) :: r - real(DOUBLE_KIND), intent(inout) :: max_mag_term - - ! This subroutine increments a number with another, both using the integer - ! representation in real_to_ints, but without doing any carrying of overflow. - ! The entire operation is embedded in a single call for greater speed. - real(DOUBLE_KIND) :: rs - integer(LONG_KIND) :: ival + integer(i8_kind), dimension(NUMINT), intent(inout) :: int_sum + real(r8_kind), intent(in) :: r + real(r8_kind), intent(inout) :: max_mag_term + + real(r8_kind) :: rs + integer(i8_kind) :: ival integer :: sgn, i if ((r >= 1e30) .eqv. (r < 1e30)) then ; NaN_error = .true. ; return ; endif @@ -503,11 +500,11 @@ subroutine increment_ints_faster(int_sum, r, max_mag_term) end subroutine increment_ints_faster +!> This subroutine handles carrying of the overflow. subroutine carry_overflow(int_sum, prec_error) - integer(LONG_KIND), dimension(NUMINT), intent(inout) :: int_sum - integer(LONG_KIND), intent(in) :: prec_error + integer(i8_kind), dimension(NUMINT), intent(inout) :: int_sum + integer(i8_kind), intent(in) :: prec_error - ! This subroutine handles carrying of the overflow. integer :: i, num_carry do i=NUMINT,2,-1 ; if (abs(int_sum(i)) > prec) then @@ -521,11 +518,11 @@ subroutine carry_overflow(int_sum, prec_error) end subroutine carry_overflow +!> This subroutine carries the overflow, and then makes sure that +!! all integers are of the same sign as the overall value. 
subroutine regularize_ints(int_sum) - integer(LONG_KIND), dimension(NUMINT), intent(inout) :: int_sum + integer(i8_kind), dimension(NUMINT), intent(inout) :: int_sum - ! This subroutine carries the overflow, and then makes sure that - ! all integers are of the same sign as the overall value. logical :: positive integer :: i, num_carry @@ -586,20 +583,20 @@ function mpp_efp_minus(EFP1, EFP2) call increment_ints(mpp_efp_minus%v(:), EFP1%v(:)) end function mpp_efp_minus +!> This subroutine assigns all components of the extended fixed point type +!! variable on the RHS (EFP2) to the components of the variable on the LHS +!! (EFP1). subroutine mpp_efp_assign(EFP1, EFP2) type(mpp_efp_type), intent(out) :: EFP1 type(mpp_efp_type), intent(in) :: EFP2 integer i - ! This subroutine assigns all components of the extended fixed point type - ! variable on the RHS (EFP2) to the components of the variable on the LHS - ! (EFP1). do i=1,NUMINT ; EFP1%v(i) = EFP2%v(i) ; enddo end subroutine mpp_efp_assign function mpp_efp_to_real(EFP1) type(mpp_efp_type), intent(inout) :: EFP1 - real(DOUBLE_KIND) :: mpp_efp_to_real + real(r8_kind) :: mpp_efp_to_real call regularize_ints(EFP1%v) mpp_efp_to_real = ints_to_real(EFP1%v) @@ -607,7 +604,7 @@ end function mpp_efp_to_real function mpp_efp_real_diff(EFP1, EFP2) type(mpp_efp_type), intent(in) :: EFP1, EFP2 - real(DOUBLE_KIND) :: mpp_efp_real_diff + real(r8_kind) :: mpp_efp_real_diff type(mpp_efp_type) :: EFP_diff @@ -617,7 +614,7 @@ function mpp_efp_real_diff(EFP1, EFP2) end function mpp_efp_real_diff function mpp_real_to_efp(val, overflow) - real(DOUBLE_KIND), intent(in) :: val + real(r8_kind), intent(in) :: val logical, optional, intent(inout) :: overflow type(mpp_efp_type) :: mpp_real_to_efp @@ -637,16 +634,15 @@ function mpp_real_to_efp(val, overflow) end function mpp_real_to_efp +!> This subroutine does a sum across PEs of a list of EFP variables, +!! returning the sums in place, with all overflows carried. 
subroutine mpp_efp_list_sum_across_PEs(EFPs, nval, errors) type(mpp_efp_type), dimension(:), intent(inout) :: EFPs integer, intent(in) :: nval logical, dimension(:), optional, intent(out) :: errors - ! This subroutine does a sum across PEs of a list of EFP variables, - ! returning the sums in place, with all overflows carried. - - integer(LONG_KIND), dimension(NUMINT,nval) :: ints - integer(LONG_KIND) :: prec_error + integer(i8_kind), dimension(NUMINT,nval) :: ints + integer(i8_kind) :: prec_error logical :: error_found character(len=256) :: mesg integer :: i, n diff --git a/mpp/mpp_io.F90 b/mpp/mpp_io.F90 index 9d1ca80768..9023759bca 100644 --- a/mpp/mpp_io.F90 +++ b/mpp/mpp_io.F90 @@ -172,7 +172,7 @@ ! integer :: type, len ! character(len=128) :: name ! character(len=256) :: catt -! real(FLOAT_KIND), pointer :: fatt(:) +! real(r4_kind), pointer :: fatt(:) ! end type atttype ! ! @@ -313,7 +313,6 @@ module mpp_io_mod -#include #define _MAX_FILE_UNITS 1024 #ifdef use_netCDF @@ -356,6 +355,7 @@ module mpp_io_mod mpp_get_io_domain_UG_layout, & mpp_get_UG_compute_domain, & mpp_get_UG_domain_pelist +use platform_mod !---------- implicit none @@ -445,7 +445,7 @@ module mpp_io_mod character(len=256) :: standard_name ! CF standard name real :: min, max, missing, fill, scale, add integer :: pack - integer(LONG_KIND), dimension(3) :: checksum + integer(i8_kind), dimension(3) :: checksum type(axistype), pointer :: axes(:) =>NULL() !axes associated with field size, time_axis_index redundantly !hold info already contained in axes. it's clunky and inelegant, !but required so that axes can be shared among multiple files @@ -462,14 +462,14 @@ module mpp_io_mod integer :: action, format, access, threading, fileset, record, ncid logical :: opened, initialized, nohdrs integer :: time_level - real(DOUBLE_KIND) :: time + real(r8_kind) :: time logical :: valid logical :: write_on_this_pe ! indicate if will write out from this pe logical :: read_on_this_pe ! 
indicate if will read from this pe logical :: io_domain_exist ! indicate if io_domain exist or not. integer :: id !variable ID of time axis associated with file (only one time axis per file) integer :: recdimid !dim ID of time axis associated with file (only one time axis per file) - real(DOUBLE_KIND), pointer :: time_values(:) =>NULL() ! time axis values are stored here instead of axis%data + real(r8_kind), pointer :: time_values(:) =>NULL() ! time axis values are stored here instead of axis%data ! since mpp_write assumes these values are not time values. ! Not used in mpp_write ! additional elements of filetype for mpp_read (ignored for mpp_write) @@ -593,13 +593,12 @@ module mpp_io_mod module procedure mpp_read_text module procedure mpp_read_region_r2D module procedure mpp_read_region_r3D -#ifdef OVERLOAD_R8 module procedure mpp_read_region_r2D_r8 module procedure mpp_read_region_r3D_r8 - module procedure mpp_read_2ddecomp_r2d_r8 - module procedure mpp_read_2ddecomp_r3d_r8 - module procedure mpp_read_2ddecomp_r4d_r8 -#endif + !! 
added for mixed prec + module procedure mpp_read_2ddecomp_r2d_r4 + module procedure mpp_read_2ddecomp_r3d_r4 + module procedure mpp_read_2ddecomp_r4d_r4 end interface !*********************************************************************** @@ -900,20 +899,16 @@ module mpp_io_mod interface write_record module procedure write_record_default -#ifdef OVERLOAD_R8 - module procedure write_record_r8 -#endif + module procedure write_record_r4 end interface interface mpp_write module procedure mpp_write_2ddecomp_r2d module procedure mpp_write_2ddecomp_r3d module procedure mpp_write_2ddecomp_r4d -#ifdef OVERLOAD_R8 - module procedure mpp_write_2ddecomp_r2d_r8 - module procedure mpp_write_2ddecomp_r3d_r8 - module procedure mpp_write_2ddecomp_r4d_r8 -#endif + module procedure mpp_write_2ddecomp_r2d_r4 + module procedure mpp_write_2ddecomp_r3d_r4 + module procedure mpp_write_2ddecomp_r4d_r4 module procedure mpp_write_r0D module procedure mpp_write_r1D module procedure mpp_write_r2D @@ -1057,7 +1052,7 @@ module mpp_io_mod namelist /mpp_io_nml/header_buffer_val, global_field_on_root_pe, io_clocks_on, & shuffle, deflate_level, cf_compliance - real(DOUBLE_KIND), allocatable :: mpp_io_stack(:) + real(r8_kind), allocatable :: mpp_io_stack(:) type(axistype),save :: default_axis !provided to users with default components type(fieldtype),save :: default_field !provided to users with default components type(atttype),save :: default_att !provided to users with default components diff --git a/mpp/mpp_memutils.F90 b/mpp/mpp_memutils.F90 index 557e0fa3ea..0ac6c89d62 100644 --- a/mpp/mpp_memutils.F90 +++ b/mpp/mpp_memutils.F90 @@ -20,8 +20,6 @@ !> Routines to initialize and report on memory usage during the model run. 
module mpp_memutils_mod -#include "../include/fms_platform.h" - use mpp_mod, only: mpp_min, mpp_max, mpp_sum, mpp_pe, mpp_root_pe use mpp_mod, only: mpp_error, FATAL, stderr, mpp_npes, get_unit diff --git a/mpp/mpp_parameter.F90 b/mpp/mpp_parameter.F90 index 1a237c33d9..e74cef008a 100644 --- a/mpp/mpp_parameter.F90 +++ b/mpp/mpp_parameter.F90 @@ -17,7 +17,7 @@ !* License along with FMS. If not, see . !*********************************************************************** module mpp_parameter_mod -#include + use platform_mod implicit none private @@ -73,10 +73,10 @@ module mpp_parameter_mod integer, parameter :: MPP_CLOCK_SYNC=1, MPP_CLOCK_DETAILED=2 integer :: DEFAULT_TAG = 1 !--- implimented to centralize _FILL_ values for land_model.F90 into mpp_mod - !------- instead of netcdf definitions, use manual assignments - integer(INT_KIND) , parameter :: MPP_FILL_INT = -2147483647 !NF_FILL_INT - real(DOUBLE_KIND) , parameter :: MPP_FILL_DOUBLE = 9.9692099683868690e+36 !NF_FILL_DOUBLE - real(FLOAT_KIND) , parameter :: MPP_FILL_FLOAT = 9.9692099683868690e+36 !NF_FILL_DOUBLE + !------- instead of multiple includes of netcdf.inc and manual assignments + integer(i4_kind) , parameter :: MPP_FILL_INT = -2147483647 !NF_FILL_INT + real(r8_kind) , parameter :: MPP_FILL_DOUBLE = 9.9692099683868690e+36 !NF_FILL_DOUBLE + real(r4_kind) , parameter :: MPP_FILL_FLOAT = 9.9692099683868690e+36 !NF_FILL_DOUBLE !--- predefined clock granularities, but you can use any integer !--- using CLOCK_LOOP and above may distort coarser-grain measurements integer, parameter :: CLOCK_COMPONENT=1 !component level, e.g model, exchange @@ -88,7 +88,7 @@ module mpp_parameter_mod integer, parameter :: CLOCK_LOOP=51 !loops or blocks within a routine integer, parameter :: CLOCK_INFRA=61 !infrastructure level, e.g halo update integer, parameter :: MAX_BINS=20 - integer(LONG_KIND), parameter :: MPP_WAIT=-1, MPP_READY=-2 + integer(i8_kind), parameter :: MPP_WAIT=-1, MPP_READY=-2 !--- The following paramters 
are used by mpp_domains_mod and its components. integer, parameter :: GLOBAL=0, CYCLIC=1 @@ -116,8 +116,8 @@ module mpp_parameter_mod ! DOMAIN_ID_BASE acts as a counter increment for domains as they are defined. It's used in ! combination with the flag parameter defined above to create a unique identifier for ! each Domain+flags combination. Therefore, the value of any flag must not exceed DOMAIN_ID_BASE. -! integer(LONG_KIND), parameter :: DOMAIN_ID_BASE=INT( 2**(4*LONG_KIND),KIND=LONG_KIND ) - integer(LONG_KIND), parameter :: DOMAIN_ID_BASE = int(Z'0000000100000000', kind=LONG_KIND) +! integer(i8_kind), parameter :: DOMAIN_ID_BASE=INT( 2**(4*i8_kind),KIND=i8_kind ) + integer(i8_kind), parameter :: DOMAIN_ID_BASE = int(Z'0000000100000000', kind=i8_kind) integer, parameter :: NON_BITWISE_EXACT_SUM=0 integer, parameter :: BITWISE_EXACT_SUM=1 integer, parameter :: BITWISE_EFP_SUM=2 @@ -143,11 +143,11 @@ module mpp_parameter_mod integer, parameter :: ROOT_GLOBAL = 9 integer, parameter :: GLOBAL_ROOT_ONLY = 2**ROOT_GLOBAL - real(DOUBLE_KIND), parameter :: NULLTIME=-1. + real(r8_kind), parameter :: NULLTIME=-1. 
#ifdef LARGE_FILE - integer(LONG_KIND), parameter :: MAX_FILE_SIZE = 4294967295 + integer(i8_kind), parameter :: MAX_FILE_SIZE = 4294967295 #else - integer(LONG_KIND), parameter :: MAX_FILE_SIZE = 2147483647 + integer(i8_kind), parameter :: MAX_FILE_SIZE = 2147483647 #endif !##################################################################### diff --git a/platform/platform.F90 b/platform/platform.F90 index b253d45730..e061580682 100644 --- a/platform/platform.F90 +++ b/platform/platform.F90 @@ -24,6 +24,7 @@ module platform_mod integer, parameter :: r8_kind=DOUBLE_KIND, r4_kind=FLOAT_KIND, & c8_kind=DOUBLE_KIND, c4_kind=FLOAT_KIND, & l8_kind=LONG_KIND, l4_kind=INT_KIND, & - i8_kind=LONG_KIND, i4_kind=INT_KIND, i2_kind=SHORT_KIND + i8_kind=LONG_KIND, i4_kind=INT_KIND, i2_kind=SHORT_KIND, & + ptr_kind=POINTER_KIND !could additionally define things like OS, compiler...: useful? end module platform_mod diff --git a/sat_vapor_pres/Makefile.am b/sat_vapor_pres/Makefile.am index b011d3fef0..7db9b4dfd8 100644 --- a/sat_vapor_pres/Makefile.am +++ b/sat_vapor_pres/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build these uninstalled convenience library. noinst_LTLIBRARIES = libsat_vapor_pres_k.la libsat_vapor_pres.la diff --git a/test_fms/Makefile.am b/test_fms/Makefile.am index e266245c2c..38bc099637 100644 --- a/test_fms/Makefile.am +++ b/test_fms/Makefile.am @@ -25,8 +25,8 @@ ACLOCAL_AMFLAGS = -I m4 # Make targets will be run in each subdirectory. Order is significant. 
SUBDIRS = diag_manager data_override exchange monin_obukhov drifters \ -mosaic interpolator fms fms2_io mpp mpp_io time_interp time_manager \ -horiz_interp field_manager axis_utils affinity +mosaic interpolator fms mpp mpp_io time_interp time_manager \ +horiz_interp field_manager axis_utils affinity fms2_io # This input file must be distributed, it is turned into # test_common.sh by configure. diff --git a/test_fms/affinity/Makefile.am b/test_fms/affinity/Makefile.am index 01fab4df4f..f7f8a4343d 100644 --- a/test_fms/affinity/Makefile.am +++ b/test_fms/affinity/Makefile.am @@ -23,7 +23,7 @@ # uramirez # Find the fms and mpp mod files. -AM_CPPFLAGS = -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/axis_utils/Makefile.am b/test_fms/axis_utils/Makefile.am index 2315350336..637b932cb7 100644 --- a/test_fms/axis_utils/Makefile.am +++ b/test_fms/axis_utils/Makefile.am @@ -23,7 +23,8 @@ # uramirez, Ed Hartnett # Find the fms and mpp mod files. -AM_CPPFLAGS = -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_builddir}/.mod \ + -I${top_builddir}/platform # Link to the FMS library. 
LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/data_override/test_data_override.F90 b/test_fms/data_override/test_data_override.F90 index 0fe4139dc5..cb398a0d19 100644 --- a/test_fms/data_override/test_data_override.F90 +++ b/test_fms/data_override/test_data_override.F90 @@ -87,8 +87,7 @@ program test use mpp_domains_mod, only : mpp_get_UG_compute_domain, mpp_pass_SG_to_UG, mpp_pass_UG_to_SG use mpp_domains_mod, only : mpp_get_ug_global_domain, mpp_global_field_ug use mpp_memutils_mod, only : mpp_memuse_begin, mpp_memuse_end - -#include "../../include/fms_platform.h" + use platform_mod implicit none @@ -664,7 +663,7 @@ end subroutine test_unstruct_grid subroutine compare_checksums( a, b, string ) real, intent(in), dimension(:,:,:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: i, j, k,pe ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. @@ -709,7 +708,7 @@ end subroutine compare_checksums subroutine compare_checksums_2D( a, b, string ) real, intent(in), dimension(:,:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: i, j,pe ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. diff --git a/test_fms/diag_manager/test_diag_manager.F90 b/test_fms/diag_manager/test_diag_manager.F90 index ae5b18758b..86a76700a8 100644 --- a/test_fms/diag_manager/test_diag_manager.F90 +++ b/test_fms/diag_manager/test_diag_manager.F90 @@ -245,7 +245,8 @@ PROGRAM test USE diag_manager_mod, ONLY: diag_field_add_cell_measures USE diag_manager_mod, ONLY: get_diag_field_id, DIAG_FIELD_NOT_FOUND USE diag_axis_mod, ONLY: get_axis_num -#include "fms_platform.h" + USE platform_mod + IMPLICIT NONE TYPE(domain2d) :: Domain1 @@ -290,29 +291,30 @@ PROGRAM test INTEGER :: id_nv, id_nv_init !!!!!! Stuff for unstrctured grid - integer(INT_KIND) :: nx = 8 ! 
(16,1) or (8,2) or (4,4) or (2,8) or (1,16) - integer(INT_KIND),dimension(:),allocatable :: pe_start ! (16,1) or (8,2) or (4,4) or (2,8) or (1,16) + integer(kind=i4_kind),dimension(:),allocatable :: pe_start !1 integer,dimension(:,:),allocatable :: unstructured_int_2D_field_data ! real, parameter :: EPSLN = 1.0e-10 character(len=256) :: atm_input_file = "INPUT/atmos_input.nc" @@ -965,7 +965,7 @@ end subroutine test_unstruct_exchange subroutine compare_chksum_2D( a, b, string ) real, intent(in), dimension(:,:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: i, j call mpp_sync_self() @@ -1002,7 +1002,7 @@ end subroutine compare_chksum_2D subroutine compare_chksum( a, b, string ) real, intent(in), dimension(:,:,:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: i, j, k ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. diff --git a/test_fms/field_manager/Makefile.am b/test_fms/field_manager/Makefile.am index d291635a96..cf28e3e8f8 100644 --- a/test_fms/field_manager/Makefile.am +++ b/test_fms/field_manager/Makefile.am @@ -23,7 +23,8 @@ # uramirez, Ed Hartnett # Find the needed mod files. -AM_CPPFLAGS = -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_builddir}/.mod \ + -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/fms/Makefile.am b/test_fms/fms/Makefile.am index 8386425657..9701513f24 100644 --- a/test_fms/fms/Makefile.am +++ b/test_fms/fms/Makefile.am @@ -23,7 +23,7 @@ # uramirez, Ed Hartnett # Find the fms_mod.mod file. -AM_CPPFLAGS = -I${top_srcdir}/include -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_srcdir}/include -I${top_builddir}/.mod -I${top_builddir}/platform # Link to the FMS library. 
LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/fms/test_fms_io.F90 b/test_fms/fms/test_fms_io.F90 index ecb4fc1ea1..cae9029585 100644 --- a/test_fms/fms/test_fms_io.F90 +++ b/test_fms/fms/test_fms_io.F90 @@ -18,7 +18,6 @@ !*********************************************************************** program test_fms_io -#include use mpp_mod, only: mpp_pe, mpp_npes, mpp_root_pe, mpp_init, mpp_exit use mpp_mod, only: stdout, mpp_error, FATAL, NOTE, mpp_chksum @@ -32,6 +31,7 @@ program test_fms_io use fms_io_mod, only: file_exist, register_restart_field, save_restart, restore_state use fms_io_mod, only: restart_file_type use mpp_io_mod, only: MAX_FILE_SIZE + use platform_mod implicit none @@ -479,7 +479,7 @@ end subroutine copy_restart_data subroutine compare_data_r5d( a, b, string ) real, intent(in), dimension(:,:,:,:,:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: i, j, k, l, n integer, parameter :: stdunit = 6 @@ -519,7 +519,7 @@ end subroutine compare_data_r5d subroutine compare_data_r4d( a, b, string ) real, intent(in), dimension(:,:,:,:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: i, j, k, l integer, parameter :: stdunit = 6 @@ -568,7 +568,7 @@ end subroutine compare_data_i4d subroutine compare_data_r3d( a, b, string ) real, intent(in), dimension(:,:,:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: i, j, l integer, parameter :: stdunit = 6 @@ -615,7 +615,7 @@ end subroutine compare_data_i3d subroutine compare_data_r2d( a, b, string ) real, intent(in), dimension(:,:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: i, l integer, parameter :: stdunit = 6 @@ -659,7 +659,7 @@ end subroutine compare_data_i2d subroutine compare_data_r1d( a, b, 
string ) real, intent(in), dimension(:) :: a, b character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 + integer(i8_kind) :: sum1, sum2 integer :: l integer, parameter :: stdunit = 6 diff --git a/test_fms/fms/test_unstructured_fms_io.F90 b/test_fms/fms/test_unstructured_fms_io.F90 index dd15be50f0..c7cf80f2a9 100644 --- a/test_fms/fms/test_unstructured_fms_io.F90 +++ b/test_fms/fms/test_unstructured_fms_io.F90 @@ -18,7 +18,7 @@ !*********************************************************************** program test_unstructured_fms_io -#include + use,intrinsic :: iso_fortran_env, only: output_unit use mpp_parameter_mod, only: FATAL, & NOTE, & @@ -34,9 +34,7 @@ program test_unstructured_fms_io mpp_clock_begin, & mpp_clock_end, & mpp_clock_id -#ifdef INTERNAL_FILE_NML use mpp_mod, only: input_nml_file -#endif use mpp_domains_mod, only: mpp_domains_init, & mpp_domains_set_stack_size, & mpp_domains_exit, & @@ -46,6 +44,8 @@ program test_unstructured_fms_io mpp_io_exit use fms_io_mod, only: fms_io_init, & fms_io_exit + use platform_mod + #ifdef use_netCDF use netcdf use netcdf_nf_data @@ -56,39 +56,40 @@ program test_unstructured_fms_io implicit none !Local variables - integer(INT_KIND) :: nx = 8 ! (16,1) or (8,2) or (4,4) or (2,8) or (1,16) - integer(INT_KIND),dimension(:),allocatable :: pe_start ! 
Write out the different possible global attributes to a netcdf file if (open_file(fileobj, "test_global_att.nc", "overwrite")) then - call register_global_attribute(fileobj, "buf_real64", real(7., kind=real64)) - call register_global_attribute(fileobj, "buf_real64_1d", (/ real(7., kind=real64), real(9., kind=real64) /)) + call register_global_attribute(fileobj, "buf_r8_kind", real(7., kind=r8_kind)) + call register_global_attribute(fileobj, "buf_r8_kind_1d", (/ real(7., kind=r8_kind), real(9., kind=r8_kind) /)) - call register_global_attribute(fileobj, "buf_real32", real(4., kind=real32)) - call register_global_attribute(fileobj, "buf_real32_1d", (/ real(4., kind=real32), real(6., kind=real32)/) ) + call register_global_attribute(fileobj, "buf_r4_kind", real(4., kind=r4_kind)) + call register_global_attribute(fileobj, "buf_r4_kind_1d", (/ real(4., kind=r4_kind), real(6., kind=r4_kind)/) ) - call register_global_attribute(fileobj, "buf_int32", int(3, kind=int32)) - call register_global_attribute(fileobj, "buf_int32_1d", (/ int(3, kind=int32), int(5, kind=int32) /) ) + call register_global_attribute(fileobj, "buf_i4_kind", int(3, kind=i4_kind)) + call register_global_attribute(fileobj, "buf_i4_kind_1d", (/ int(3, kind=i4_kind), int(5, kind=i4_kind) /) ) - call register_global_attribute(fileobj, "buf_int64", int(2, kind=int64)) - call register_global_attribute(fileobj, "buf_int64_1d", (/ int(2, kind=int64), int(4, kind=int64) /) ) + call register_global_attribute(fileobj, "buf_i8_kind", int(2, kind=i8_kind)) + call register_global_attribute(fileobj, "buf_i8_kind_1d", (/ int(2, kind=i8_kind), int(4, kind=i8_kind) /) ) call register_global_attribute(fileobj, "buf_str", "some text"//char(0), str_len=10) @@ -60,17 +60,17 @@ program test_global_att !> Read the global attributes from the netcdf file if (open_file(fileobj, "test_global_att.nc", "read")) then - call get_global_attribute(fileobj, "buf_real64", buf_real64) - call get_global_attribute(fileobj, "buf_real64_1d", 
buf_real64_1d) + call get_global_attribute(fileobj, "buf_r8_kind", buf_r8_kind) + call get_global_attribute(fileobj, "buf_r8_kind_1d", buf_r8_kind_1d) - call get_global_attribute(fileobj, "buf_real32", buf_real32) - call get_global_attribute(fileobj, "buf_real32_1d", buf_real32_1d) + call get_global_attribute(fileobj, "buf_r4_kind", buf_r4_kind) + call get_global_attribute(fileobj, "buf_r4_kind_1d", buf_r4_kind_1d) - call get_global_attribute(fileobj, "buf_int32", buf_int32) - call get_global_attribute(fileobj, "buf_int32_1d", buf_int32_1d) + call get_global_attribute(fileobj, "buf_i4_kind", buf_i4_kind) + call get_global_attribute(fileobj, "buf_i4_kind_1d", buf_i4_kind_1d) - call get_global_attribute(fileobj, "buf_int64", buf_int64) - call get_global_attribute(fileobj, "buf_int64_1d", buf_int64_1d) + call get_global_attribute(fileobj, "buf_i8_kind", buf_i8_kind) + call get_global_attribute(fileobj, "buf_i8_kind_1d", buf_i8_kind_1d) call get_global_attribute(fileobj, "buf_str", buf_str) @@ -80,21 +80,21 @@ program test_global_att endif !> Compares the values read with the expected values -if (buf_real64 /= real(7., kind=real64)) call mpp_error(FATAL, "test_global_att: error reading buf_real64") -if (buf_real64_1d(1) /= real(7., kind=real64) .or. buf_real64_1d(2) /= real(9., kind=real64)) & - call mpp_error(FATAL, "test_global_att: error reading buf_real64_1d") +if (buf_r8_kind /= real(7., kind=r8_kind)) call mpp_error(FATAL, "test_global_att: error reading buf_r8_kind") +if (buf_r8_kind_1d(1) /= real(7., kind=r8_kind) .or. buf_r8_kind_1d(2) /= real(9., kind=r8_kind)) & + call mpp_error(FATAL, "test_global_att: error reading buf_r8_kind_1d") -if (buf_real32 /= real(4., kind=real32)) call mpp_error(FATAL, "test_global_att: error reading buf_real32") -if (buf_real32_1d(1) /= real(4., kind=real32) .or. 
buf_real32_1d(2) /= real(6., kind=real32)) & - call mpp_error(FATAL, "test_global_att: error reading buf_real32_1d") +if (buf_r4_kind /= real(4., kind=r4_kind)) call mpp_error(FATAL, "test_global_att: error reading buf_r4_kind") +if (buf_r4_kind_1d(1) /= real(4., kind=r4_kind) .or. buf_r4_kind_1d(2) /= real(6., kind=r4_kind)) & + call mpp_error(FATAL, "test_global_att: error reading buf_r4_kind_1d") -if (buf_int32 /= int(3, kind=int32)) call mpp_error(FATAL, "test_global_att: error reading buf_int32") -if (buf_int32_1d(1) /= int(3, kind=int32) .or. buf_int32_1d(2) /= int(5, kind=int32)) & - call mpp_error(FATAL, "test_global_att: error reading buf_int32_1d") +if (buf_i4_kind /= int(3, kind=i4_kind)) call mpp_error(FATAL, "test_global_att: error reading buf_i4_kind") +if (buf_i4_kind_1d(1) /= int(3, kind=i4_kind) .or. buf_i4_kind_1d(2) /= int(5, kind=i4_kind)) & + call mpp_error(FATAL, "test_global_att: error reading buf_i4_kind_1d") -if (buf_int64 /= int(2, kind=int64)) call mpp_error(FATAL, "test_global_att: error reading buf_int64") -if (buf_int64_1d(1) /= int(2, kind=int64) .or. buf_int64_1d(2) /= int(4, kind=int64)) & - call mpp_error(FATAL, "test_global_att: error reading buf_int64_1d") +if (buf_i8_kind /= int(2, kind=i8_kind)) call mpp_error(FATAL, "test_global_att: error reading buf_i8_kind") +if (buf_i8_kind_1d(1) /= int(2, kind=i8_kind) .or. buf_i8_kind_1d(2) /= int(4, kind=i8_kind)) & + call mpp_error(FATAL, "test_global_att: error reading buf_i8_kind_1d") if (trim(buf_str) /= "some text") then print *, "buf_str read in = ", trim(buf_str) diff --git a/test_fms/fms2_io/test_io_simple.F90 b/test_fms/fms2_io/test_io_simple.F90 index 32a4298745..079d10ed58 100644 --- a/test_fms/fms2_io/test_io_simple.F90 +++ b/test_fms/fms2_io/test_io_simple.F90 @@ -30,6 +30,7 @@ program test_io_simple use mpp_mod use setup use netcdf + use platform_mod implicit none type(Params) :: test_params !> Some test parameters. 
@@ -59,7 +60,7 @@ program test_io_simple integer :: ncid !> File ID for checking file. character (len = 80) :: testfile !> Base name for file created in test. integer :: numfilesatt !> Value for global att in test file. - real (kind=real64) :: att1 !> Value for global att in test file. + real (kind=r8_kind) :: att1 !> Value for global att in test file. character (len = 120), dimension(3) :: my_format !> Array of formats to try. character (len = 6), dimension(4) :: names !> Dim name. character (len = 6) :: dimname !> Dim name we will read in. @@ -69,8 +70,8 @@ program test_io_simple integer, dimension(1) :: dimids !> More var info we will read in. integer :: nAtts !> More var info we will read in. integer, dimension(4) :: domain_decomposition !> Domain decomposition we will read. - real (kind = real64), dimension(96) :: double_buffer !> Data we will write. - real (kind = real64), dimension(96) :: double_buffer_in !> Data we will read to check. + real (kind = r8_kind), dimension(96) :: double_buffer !> Data we will write. + real (kind = r8_kind), dimension(96) :: double_buffer_in !> Data we will read to check. integer :: i !> Index for do loop. integer :: j !> Index for do loop. integer :: err !> Return code. @@ -113,7 +114,7 @@ program test_io_simple domain, nc_format=my_format(1), is_restart=.false.)) ! Add a global attribute. - call register_global_attribute(fileobj, "globalatt1", real(7., kind=real64)) + call register_global_attribute(fileobj, "globalatt1", real(7., kind=r8_kind)) ! Add a dimension. 
call register_axis(fileobj, "lon", "x") diff --git a/test_fms/fms2_io/test_io_with_mask.F90 b/test_fms/fms2_io/test_io_with_mask.F90 index 6d332573d0..9c38c77954 100644 --- a/test_fms/fms2_io/test_io_with_mask.F90 +++ b/test_fms/fms2_io/test_io_with_mask.F90 @@ -35,7 +35,7 @@ program test_io_with_mask use netcdf, only: nf90_open, nf90_get_var, nf90_nowrite, NF90_NOERR, nf90_get_var, & nf90_close use mpi, only: mpi_barrier, mpi_comm_world -use, intrinsic :: iso_fortran_env, only : real64 +use platform_mod implicit none @@ -45,9 +45,9 @@ program test_io_with_mask type(domain2d) :: Domain !< Domain with mask table real, dimension(:), allocatable :: x !< x axis data real, dimension(:), allocatable :: y !< y axis data -real(kind=real64), allocatable, dimension(:,:) :: sst !< Data to be written -real(kind=real64), allocatable, dimension(:,:) :: sst_in !< Buffer where data will be read with netcdf -real(kind=real64), allocatable, dimension(:,:) :: sst_in2 !< Buffer where data will be read with fms2io +real(kind=r8_kind), allocatable, dimension(:,:) :: sst !< Data to be written +real(kind=r8_kind), allocatable, dimension(:,:) :: sst_in !< Buffer where data will be read with netcdf +real(kind=r8_kind), allocatable, dimension(:,:) :: sst_in2 !< Buffer where data will be read with fms2io logical, allocatable, dimension(:,:) :: parsed_mask !< Parsed masked character(len=6), dimension(2) :: names !< Dimensions names type(FmsNetcdfDomainFile_t) :: fileobj !< fms2io fileobj for domain decomposed @@ -86,7 +86,7 @@ program test_io_with_mask y(j) = j enddo -sst = real(7., kind=real64) +sst = real(7., kind=r8_kind) !< Open a netCDF file and initialize the file object. 
if (open_file(fileobj, "test_io_with_mask.nc", "overwrite", domain)) then @@ -98,7 +98,7 @@ program test_io_with_mask !< Register the variable and Write out the data call register_field(fileobj, "sst", "double", names(1:2)) - call register_variable_attribute(fileobj, "sst", "_FillValue", real(999., kind=real64)) + call register_variable_attribute(fileobj, "sst", "_FillValue", real(999., kind=r8_kind)) call write_data(fileobj, "sst", sst) !< Close the file @@ -117,17 +117,17 @@ program test_io_with_mask if (err .ne. NF90_NOERR) call mpp_error(FATAL, "test_io_with_mask: error reading from the file") !< x: 1-30 y: 1-20 are masked out, so sst_in for this values has to be equal to the fill value - !! For the other points the data should be equal to real(7., kind=real64) + !! For the other points the data should be equal to real(7., kind=r8_kind) do i=1,nlon do j=1,nlat if (i > 30 .or. j > 20) then - if (sst_in(i,j) .ne. real(7., kind=real64)) then + if (sst_in(i,j) .ne. real(7., kind=r8_kind)) then print *, 'i=', i, ' j=', j, ' sst_in=', sst_in(i,j) call mpp_error(FATAL, "test_io_with_mask: the unmasked data read in is not correct") endif else - if (sst_in(i,j) .ne. real(999., kind=real64)) then + if (sst_in(i,j) .ne. real(999., kind=r8_kind)) then print *, 'i=', i, ' j=', j, ' sst_in=', sst_in(i,j) call mpp_error(FATAL, "test_io_with_mask: the masked data read in is not correct") endif @@ -158,7 +158,7 @@ program test_io_with_mask do i=is,ie do j=js,je - if (sst_in2(i,j) .ne. real(7., kind=real64)) then + if (sst_in2(i,j) .ne. real(7., kind=r8_kind)) then print *, 'i=', i, ' j=', j, ' sst_in=', sst_in2(i,j) call mpp_error(FATAL, "test_io_with_mask: the unmasked data read in is not correct") endif diff --git a/test_fms/horiz_interp/Makefile.am b/test_fms/horiz_interp/Makefile.am index 82ba1fe9e8..8672bda6cd 100644 --- a/test_fms/horiz_interp/Makefile.am +++ b/test_fms/horiz_interp/Makefile.am @@ -23,7 +23,8 @@ # uramirez, Ed Hartnett # Find the needed mod files. 
-AM_CPPFLAGS = -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_builddir}/.mod \ + -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/interpolator/Makefile.am b/test_fms/interpolator/Makefile.am index 5f5b8df71c..3363777d99 100644 --- a/test_fms/interpolator/Makefile.am +++ b/test_fms/interpolator/Makefile.am @@ -23,7 +23,8 @@ # uramirez, Ed Hartnett # Find the fms_mod.mod file. -AM_CPPFLAGS = -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_builddir}/.mod \ + -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/monin_obukhov/Makefile.am b/test_fms/monin_obukhov/Makefile.am index fbc771761f..63b2ce1581 100644 --- a/test_fms/monin_obukhov/Makefile.am +++ b/test_fms/monin_obukhov/Makefile.am @@ -23,7 +23,8 @@ # uramirez, Ed Hartnett # Find the needed mod files. -AM_CPPFLAGS = -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_builddir}/.mod \ + -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/mosaic/Makefile.am b/test_fms/mosaic/Makefile.am index 12e54deb71..d29cfd0854 100644 --- a/test_fms/mosaic/Makefile.am +++ b/test_fms/mosaic/Makefile.am @@ -23,7 +23,7 @@ # uramirez, Ed Hartnett # Find the needed mod and include files. -AM_CPPFLAGS = -I${top_srcdir}/include -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_srcdir}/include -I${top_builddir}/.mod -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/mpp/Makefile.am b/test_fms/mpp/Makefile.am index 49e96ced1d..b4005883b0 100644 --- a/test_fms/mpp/Makefile.am +++ b/test_fms/mpp/Makefile.am @@ -20,38 +20,56 @@ # @uramirez, Ed Hartnett, @underwoo # Find the needed mod and inc files. 
-AM_CPPFLAGS = -I${top_srcdir}/include -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_srcdir}/include \ + -I${top_builddir}/mpp \ + -I${top_builddir}/affinity \ + -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la # Build these test programs. check_PROGRAMS = test_mpp \ - test_mpp_domains \ - test_mpp_memuse \ - test_mpp_mem_dump \ - test_mpp_memutils_begin_end \ - test_mpp_print_memuse_stats_stderr \ - test_mpp_print_memuse_stats_file \ - test_mpp_memutils_begin_2x \ - test_mpp_memutils_end_before_begin \ - test_read_ascii_file \ - test_read_input_nml \ - test_stdout \ - test_stderr \ - test_mpp_get_ascii_lines \ - test_system_clock \ - test_mpp_broadcast \ - test_clock_init \ - test_domains_simple \ - test_mpp_npes \ - test_mpp_pe \ - test_mpp_root_pe \ - test_peset + test_mpp_domains \ + test_redistribute_int \ + test_mpp_memuse \ + test_mpp_mem_dump \ + test_mpp_memutils_begin_end \ + test_mpp_print_memuse_stats_stderr \ + test_mpp_print_memuse_stats_file \ + test_mpp_memutils_begin_2x \ + test_mpp_memutils_end_before_begin \ + test_read_ascii_file \ + test_read_input_nml \ + test_stdout \ + test_stderr \ + test_mpp_get_ascii_lines \ + test_system_clock \ + test_mpp_broadcast \ + test_clock_init \ + test_domains_simple \ + test_mpp_npes \ + test_mpp_pe \ + test_mpp_root_pe \ + test_peset \ + test_mpp_update_domains \ + test_mpp_gatscat \ + test_mpp_sum \ + test_update_domains_performance \ + test_minmax \ + test_mpp_sendrecv \ + test_global_arrays \ + test_chksum_int \ + test_mpp_update_domains_ad \ + test_mpp_transmit \ + test_mpp_alltoall \ + test_mpp_global_field \ + test_mpp_global_field_ug \ + test_mpp_global_sum_ad # These are the sources for the tests. 
test_mpp_SOURCES = test_mpp.F90 -test_mpp_domains_SOURCES = test_mpp_domains.F90 +test_mpp_domains_SOURCES = test_mpp_domains.F90 compare_data_checksums.F90 test_domains_utility_mod.F90 test_mpp_memuse_SOURCES=test_mpp_memuse.F90 test_mpp_mem_dump_SOURCES=test_mpp_mem_dump.F90 test_mpp_memutils_begin_end_SOURCES=test_mpp_memutils_begin_end.F90 @@ -72,9 +90,38 @@ test_mpp_npes_SOURCES = test_mpp_npes.F90 test_mpp_pe_SOURCES = test_mpp_pe.F90 test_mpp_root_pe_SOURCES=test_mpp_root_pe.F90 test_peset_SOURCES=test_peset.F90 +test_mpp_update_domains_SOURCES = test_mpp_update_domains_main.F90 \ + test_mpp_update_domains_real.F90 \ + test_mpp_update_domains_int.F90 \ + fill_halo.F90 \ + compare_data_checksums.F90 \ + compare_data_checksums_int.F90 +test_mpp_gatscat_SOURCES=test_mpp_gatscat.F90 +test_mpp_sendrecv_SOURCES=test_mpp_sendrecv.F90 +test_mpp_sum_SOURCES=test_mpp_sum.F90 +test_update_domains_performance_SOURCES=test_update_domains_performance.F90 \ + compare_data_checksums.F90 \ + compare_data_checksums_int.F90 +test_minmax_SOURCES=test_minmax.F90 +test_mpp_update_domains_ad_SOURCES=test_mpp_update_domains_ad.F90 compare_data_checksums.F90 +test_global_arrays_SOURCES=test_global_arrays.F90 +test_chksum_int_SOURCES=test_chksum_int.F90 +test_redistribute_int_SOURCES=test_redistribute_int.F90 +test_mpp_transmit_SOURCES=test_mpp_transmit.F90 +test_mpp_alltoall_SOURCES=test_mpp_alltoall.F90 +test_mpp_global_field_SOURCES=test_mpp_global_field.F90 \ + compare_data_checksums.F90 \ + compare_data_checksums_int.F90 +test_mpp_global_field_ug_SOURCES=test_mpp_global_field_ug.F90 \ + compare_data_checksums.F90 \ + compare_data_checksums_int.F90 +test_mpp_global_sum_ad_SOURCES=test_mpp_global_sum_ad.F90 + # Run the test programs. 
TESTS = test_mpp_domains2.sh \ + test_redistribute_int.sh \ + test_global_arrays.sh \ test_mpp2.sh \ test_mpp_memuse \ test_mpp_mem_dump \ @@ -88,34 +135,89 @@ TESTS = test_mpp_domains2.sh \ test_mpp_broadcast.sh \ test_clock_init.sh \ test_mpp_npes.sh \ - test_mpp_pe.sh \ - test_mpp_root_pe.sh \ - test_peset.sh + test_mpp_pe.sh \ + test_mpp_root_pe.sh \ + test_peset.sh \ + test_mpp_update_domains.sh \ + test_mpp_sum.sh \ + test_mpp_gatscat.sh \ + test_update_domains_performance.sh \ + test_minmax.sh \ + test_mpp_sendrecv.sh \ + test_chksum_int.sh \ + test_mpp_update_domains_ad.sh \ + test_mpp_transmit.sh \ + test_mpp_alltoall.sh \ + test_mpp_global_field.sh \ + test_mpp_global_field_ug.sh \ + test_mpp_global_sum_ad.sh # These files will also be included in the distribution. EXTRA_DIST = input_base.nml \ - test_mpp_domains2.sh \ - test_mpp2.sh \ - test_mpp_memutils_mod.sh \ - test_read_ascii_file.sh \ - test_read_input_nml2.sh \ - test_stdout.sh \ - test_stderr.sh \ - test_mpp_get_ascii_lines2.sh \ - base_ascii_5 \ - base_ascii_25 \ - base_ascii_0 \ - base_ascii_skip \ - base_ascii_long \ - test_system_clock.sh \ - test_mpp_broadcast.sh \ - test_clock_init.sh \ - test_mpp_npes.sh \ - test_mpp_pe.sh \ - test_mpp_root_pe.sh \ - test_peset.sh + test_mpp_domains2.sh \ + test_mpp2.sh \ + test_mpp_memutils_mod.sh \ + test_read_ascii_file.sh \ + test_read_input_nml2.sh \ + test_stdout.sh \ + test_stderr.sh \ + test_mpp_get_ascii_lines2.sh \ + base_ascii_5 \ + base_ascii_25 \ + base_ascii_0 \ + base_ascii_skip \ + base_ascii_long \ + test_system_clock.sh \ + test_mpp_broadcast.sh \ + test_clock_init.sh \ + test_mpp_npes.sh \ + test_mpp_pe.sh \ + test_mpp_root_pe.sh \ + test_peset.sh \ + test_mpp_update_domains.sh \ + test_mpp_sum.sh \ + test_mpp_gatscat.sh \ + test_update_domains_performance.sh \ + test_minmax.sh \ + test_mpp_sendrecv.sh \ + test_global_arrays.sh \ + test_chksum_int.sh \ + test_redistribute_int.sh \ + test_mpp_update_domains_ad.sh \ + test_mpp_transmit.sh 
\ + test_mpp_alltoall.sh \ + test_mpp_global_field.sh \ + test_mpp_global_field_ug.sh \ + test_mpp_global_sum_ad.sh + +# Each mod file depends on the .lo (library object) file. +compare_data_checksums.mod : compare_data_checksums.lo +compare_data_checksums_int.mod : compare_data_checksums_int.lo +fill_halo.mod : fill_halo.lo +test_mpp_update_domains_real.mod : test_mpp_update_domains_real.lo +test_mpp_update_domains_int.mod : test_mpp_update_domains_int.lo +test_domains_utility_mod.mod : test_domains_utility_mod.lo + +# Some mods are dependant on other mods in this directory +test_mpp_update_domains_main.lo : test_mpp_update_domains_real.mod test_mpp_update_domains_int.mod +test_mpp_update_domains_real.lo : fill_halo.mod compare_data_checksums.mod +test_mpp_update_domains_int.lo: fill_halo.mod compare_data_checksums_int.mod +test_update_domains_performance.lo : compare_data_checksums.mod compare_data_checksums_int.mod +test_mpp_update_domains_ad.lo : compare_data_checksums.mod +test_mpp_domains.lo : compare_data_checksums.mod test_domains_utility_mod.mod +test_mpp_global_field.lo : compare_data_checksums.mod compare_data_checksums_int.mod +test_mpp_global_field_ug.lo : compare_data_checksums.mod compare_data_checksums_int.mod + +# Mod files are built and then installed as headers. 
+MODFILES = compare_data_checksums.mod \ + compare_data_checksums_int.mod \ + fill_halo.mod \ + test_mpp_update_domains_real.mod \ + test_mpp_update_domains_int.mod \ + test_domains_utility_mod.mod +BUILT_SOURCES = $(MODFILES) +include_HEADERS = $(MODFILES) # Clean up CLEANFILES = input.nml input_alternative.nml input_blank.nml empty.nml *.out* *.tst* \ - include_files_mod.mod ascii* test_numb* - + ascii* test_numb* *.mod diff --git a/test_fms/mpp/ascii_0 b/test_fms/mpp/ascii_0 new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_fms/mpp/ascii_25 b/test_fms/mpp/ascii_25 new file mode 100644 index 0000000000..ac81a22b2b --- /dev/null +++ b/test_fms/mpp/ascii_25 @@ -0,0 +1,25 @@ +"this is an ascii file with 5 lines" +"it will contain commas inside quotes", "as well as outside quotes" +"it will not have the same number of fields on every line" +some lines will not have quotes +"there might be a line with the character string \n" +"this is an ascii file with 5 lines" +"it will contain commas inside quotes", "as well as outside quotes" +"it will not have the same number of fields on every line" +some lines will not have quotes +"there might be a line with the character string \n" +"this is an ascii file with 5 lines" +"it will contain commas inside quotes", "as well as outside quotes" +"it will not have the same number of fields on every line" +some lines will not have quotes +"there might be a line with the character string \n" +"this is an ascii file with 5 lines" +"it will contain commas inside quotes", "as well as outside quotes" +"it will not have the same number of fields on every line" +some lines will not have quotes +"there might be a line with the character string \n" +"this is an ascii file with 5 lines" +"it will contain commas inside quotes", "as well as outside quotes" +"it will not have the same number of fields on every line" +some lines will not have quotes +"there might be a line with the character string \n" diff --git 
a/test_fms/mpp/ascii_5 b/test_fms/mpp/ascii_5 new file mode 100644 index 0000000000..e1f183a39a --- /dev/null +++ b/test_fms/mpp/ascii_5 @@ -0,0 +1,5 @@ +"this is an ascii file with 5 lines" +"it will contain commas inside quotes", "as well as outside quotes" +"it will not have the same number of fields on every line" +some lines will not have quotes +"there might be a line with the character string \n" diff --git a/test_fms/mpp/ascii_long b/test_fms/mpp/ascii_long new file mode 100644 index 0000000000..9ba15c00d3 --- /dev/null +++ b/test_fms/mpp/ascii_long @@ -0,0 +1,5 @@ +"this is an ascii file with 5 lines" +""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes""it will contain commas inside quotes", "as well as outside quotes"it will contain commas inside quotes", "as well as outside quotes" +"it will not have the same number of fields on every line" +some lines will not have quotes +"there might be a line with the character string \n" diff --git a/test_fms/mpp/ascii_skip b/test_fms/mpp/ascii_skip new file mode 100644 index 0000000000..350bbf96b4 --- /dev/null +++ b/test_fms/mpp/ascii_skip @@ -0,0 +1,5 @@ +"this is an ascii file with 5 lines" +"it will contain commas inside quotes", "as well as outside quotes" +"it will not have the same number of fields on every line" + +"there might be a blank line beforehand" diff --git 
a/test_fms/mpp/compare_data_checksums.F90 b/test_fms/mpp/compare_data_checksums.F90 new file mode 100644 index 0000000000..c7bedcd7ed --- /dev/null +++ b/test_fms/mpp/compare_data_checksums.F90 @@ -0,0 +1,207 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . 
+!*********************************************************************** +!> @author Jessica Liptak +!> @brief Compare the checksums of 2D and 3D 32-bit or 64-bit real and integer arrays +module compare_data_checksums + +use mpp_mod, only : mpp_root_pe, mpp_chksum, mpp_error, mpp_sync_self, mpp_pe +use mpp_mod, only : FATAL, NOTE +use platform_mod + +implicit none +private + + +integer :: stdunit = 6 + +public :: compare_checksums + +interface compare_checksums + module procedure compare_checksums_2D_r4 + module procedure compare_checksums_3D_r4 + module procedure compare_checksums_2D_r8 + module procedure compare_checksums_3D_r8 +end interface compare_checksums + +contains + + !> compare checksums of 2D 32-bit real arrays + subroutine compare_checksums_2D_r4( a, b, chk_str ) + real(kind=r4_kind), intent(in), dimension(:,:) :: a, b !< 2D arrays to compare + character(len=*), intent(in) :: chk_str + integer(kind=i8_kind) :: sum1, sum2 + integer :: i, j + integer :: pe + !> @note can't call mpp_sync here since there might be different number of tiles on each pe. + call mpp_sync_self() + pe = mpp_pe() + + if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) ) & + call mpp_error(FATAL,'compare_checksums_2D_r4: sizes of a and b do not match') + + do j = 1, size(a,2) + do i = 1, size(a,1) + if(a(i,j) .ne. b(i,j)) then + print*, "a =", a(i,j) + print*, "b =", b(i,j) + write(*,'(a,i3,a,i3,a,i3,a,f20.9,a,f20.9)')"at the pe ", mpp_pe(), & + ", at point (",i,", ", j, "),a=", a(i,j), ",b=", b(i,j) + call mpp_error(FATAL, trim(chk_str)//': value mismatch at data point.') + endif + enddo + enddo + + sum1 = mpp_chksum( a, (/pe/) ) + sum2 = mpp_chksum( b, (/pe/) ) + + if( sum1.EQ.sum2 )then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(chk_str)//': OK.' ) + !> @note in some cases, even though the checksum agree, the two arrays + !! actually are different [e.g.,(1.1,-1.2) with (-1.1,1.2)]. + !! Thus, we need to check the values point-by-point. 
+ else + call mpp_error( FATAL, trim(chk_str)//': checksums do not match.' ) + end if + end subroutine compare_checksums_2D_r4 + + !> Compare the checksums of 2 3D 32-bit real arrays + subroutine compare_checksums_3D_r4( a, b, string ) + real(kind=r4_kind), intent(in), dimension(:,:,:) :: a, b !< 3D 32-bit real arrays to compare + character(len=*), intent(in) :: string + integer(kind=i8_kind) :: sum1, sum2 + integer :: i, j, k + integer :: pe + ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. + call mpp_sync_self() + pe = mpp_pe() + + if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) .or. size(a,3) .ne. size(b,3) ) & + call mpp_error(FATAL,'compare_checksums_3d_r4: sizes of a and b do not match') + + do k = 1, size(a,3) + do j = 1, size(a,2) + do i = 1, size(a,1) + if(a(i,j,k) .ne. b(i,j,k)) then + write(*,'(a,i3,a,i3,a,i3,a,i3,a,f20.9,a,f20.9)') trim(string)//" at pe ", mpp_pe(), & + ", at point (",i,", ", j, ", ", k, "), a = ", a(i,j,k), ", b = ", b(i,j,k) + call mpp_error(FATAL, trim(string)//': mismatch in checksums at data point.') + endif + enddo + enddo + enddo + + sum1 = mpp_chksum( a, (/pe/) ) + sum2 = mpp_chksum( b, (/pe/) ) + + if( sum1.EQ.sum2 )then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(string)//': OK.' ) + !--- in some case, even though checksum agree, the two arrays + ! actually are different, like comparing (1.1,-1.2) with (-1.1,1.2) + !--- hence we need to check the value point by point. + else + write(stdunit, *)"sum1 =", sum1, mpp_pe() + write(stdunit, *)"sum2 =", sum2, mpp_pe() + write(stdunit,'(a,i3,a,i20,a,i20)')" at pe ", mpp_pe(), " sum(a)=", sum1, " sum(b)=", sum2 + call mpp_error( FATAL, trim(string)//': checksums do not match.'
) + end if + end subroutine compare_checksums_3D_r4 + + !> compare checksums of 2D 64-bit real arrays + subroutine compare_checksums_2D_r8( a, b, chk_str ) + real(kind=r8_kind), intent(in), dimension(:,:) :: a, b !< 2D arrays to compare + character(len=*), intent(in) :: chk_str + integer(kind=i8_kind) :: sum1, sum2 + integer :: i, j + integer :: pe + + !> @note can't call mpp_sync here since there might be different number of tiles on each pe. + call mpp_sync_self() + pe = mpp_pe() + + if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) ) & + call mpp_error(FATAL,'compare_checksums_2d_r8: sizes of a and b do not match') + + do j = 1, size(a,2) + do i = 1, size(a,1) + if(a(i,j) .ne. b(i,j)) then + print*, "a =", a(i,j) + print*, "b =", b(i,j) + write(*,'(a,i3,a,i3,a,i3,a,f20.9,a,f20.9)')"at the pe ", mpp_pe(), & + ", at point (",i,", ", j, "),a=", a(i,j), ",b=", b(i,j) + call mpp_error(FATAL, trim(chk_str)//': value mismatch at data point.') + endif + enddo + enddo + + sum1 = mpp_chksum( a, (/pe/) ) + sum2 = mpp_chksum( b, (/pe/) ) + + if( sum1.EQ.sum2 )then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(chk_str)//': OK.' ) + !> @note in some cases, even though the checksum agree, the two arrays + !! actually are different [e.g.,(1.1,-1.2) with (-1.1,1.2)]. + !! Thus, we need to check the values point-by-point. + else + call mpp_error( FATAL, trim(chk_str)//': checksums do not match.' ) + end if + end subroutine compare_checksums_2D_r8 + + !> Compare the checksums of 2 3D 64-bit real arrays + subroutine compare_checksums_3D_r8( a, b, string ) + real(kind=r8_kind), intent(in), dimension(:,:,:) :: a, b !< 3D 64-bit real arrays to compare + character(len=*), intent(in) :: string + integer(kind=i8_kind) :: sum1, sum2 + integer :: i, j, k + integer :: pe + ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. + call mpp_sync_self() + pe = mpp_pe() + + if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) .or. 
size(a,3) .ne. size(b,3) ) & + call mpp_error(FATAL,'compare_checksums_3d_r8: size of a and b does not match') + + do k = 1, size(a,3) + do j = 1, size(a,2) + do i = 1, size(a,1) + if(a(i,j,k) .ne. b(i,j,k)) then + write(*,'(a,i3,a,i3,a,i3,a,i3,a,f20.9,a,f20.9)') trim(string)//" at pe ", mpp_pe(), & + ", at point (",i,", ", j, ", ", k, "), a = ", a(i,j,k), ", b = ", b(i,j,k) + call mpp_error(FATAL, trim(string)//': mismatch in checksums at data point.') + endif + enddo + enddo + enddo + + sum1 = mpp_chksum( a, (/pe/) ) + sum2 = mpp_chksum( b, (/pe/) ) + + if( sum1.EQ.sum2 )then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(string)//': OK.' ) + !--- in some case, even though checksum agree, the two arrays + ! actually are different, like comparing (1.1,-1.2) with (-1.1,1.2) + !--- hence we need to check the value point by point. + else + write(stdunit, *)"sum1 =", sum1, mpp_pe() + write(stdunit, *)"sum2 =", sum2, mpp_pe() + write(stdunit,'(a,i3,a,i20,a,i20)')" at pe ", mpp_pe(), " sum(a)=", sum1, " sum(b)=", sum2 + call mpp_error( FATAL, trim(string)//': checksums do not match.' ) + end if + end subroutine compare_checksums_3D_r8 + +end module compare_data_checksums diff --git a/test_fms/mpp/compare_data_checksums_int.F90 b/test_fms/mpp/compare_data_checksums_int.F90 new file mode 100644 index 0000000000..e99047b16e --- /dev/null +++ b/test_fms/mpp/compare_data_checksums_int.F90 @@ -0,0 +1,207 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. 
+!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @author Jessica Liptak +!> @brief Compare the checksums of 2D and 3D 32-bit or 64-bit integer arrays +module compare_data_checksums_int + +use mpp_mod, only : mpp_root_pe, mpp_chksum, mpp_error, mpp_sync_self, mpp_pe +use mpp_mod, only : FATAL, NOTE +use platform_mod, only : i4_kind, i8_kind + +implicit none +private + + +integer :: stdunit = 6 + +public :: compare_checksums_int + +interface compare_checksums_int + module procedure compare_checksums_2D_i4 + module procedure compare_checksums_3D_i4 + module procedure compare_checksums_2D_i8 + module procedure compare_checksums_3D_i8 +end interface compare_checksums_int + +contains + + !> compare checksums of 2D 32-bit integer arrays + subroutine compare_checksums_2D_i4( a, b, chk_str ) + integer(kind=i4_kind), intent(in), dimension(:,:) :: a, b !< 2D arrays to compare + character(len=*), intent(in) :: chk_str + integer(kind=i8_kind) :: sum1, sum2 + integer :: i, j + integer :: pe + !> @note can't call mpp_sync here since there might be different number of tiles on each pe. + call mpp_sync_self() + pe = mpp_pe() + + if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) ) & + call mpp_error(FATAL,'compare_checksums_2D_i4: sizes of a and b do not match') + + do j = 1, size(a,2) + do i = 1, size(a,1) + if(a(i,j) .ne.
b(i,j)) then + print*, "a =", a(i,j) + print*, "b =", b(i,j) + write(*,'(a,i3,a,i3,a,i3,a,f20.9,a,f20.9)')"at the pe ", mpp_pe(), & + ", at point (",i,", ", j, "),a=", a(i,j), ",b=", b(i,j) + call mpp_error(FATAL, trim(chk_str)//': value mismatch at data point.') + endif + enddo + enddo + + sum1 = mpp_chksum( a, (/pe/) ) + sum2 = mpp_chksum( b, (/pe/) ) + + if( sum1.EQ.sum2 )then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(chk_str)//': OK.' ) + !> @note in some cases, even though the checksum agree, the two arrays + !! actually are different [e.g.,(1.1,-1.2) with (-1.1,1.2)]. + !! Thus, we need to check the values point-by-point. + else + call mpp_error( FATAL, trim(chk_str)//': checksums do not match.' ) + end if + end subroutine compare_checksums_2D_i4 + + !> Compare the checksums of 2 3D 32-bit real arrays + subroutine compare_checksums_3D_i4( a, b, string ) + integer(kind=i4_kind), intent(in), dimension(:,:,:) :: a, b !< 3D 64-bit real arrays to compare + character(len=*), intent(in) :: string + integer(kind=i8_kind) :: sum1, sum2 + integer :: i, j, k + integer :: pe + ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. + call mpp_sync_self() + pe = mpp_pe() + + if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) .or. size(a,3) .ne. size(b,3) ) & + call mpp_error(FATAL,'compare_checkums_3d_r4: sizes of a and b do not match') + + do k = 1, size(a,3) + do j = 1, size(a,2) + do i = 1, size(a,1) + if(a(i,j,k) .ne. b(i,j,k)) then + write(*,'(a,i3,a,i3,a,i3,a,i3,a,f20.9,a,f20.9)') trim(string)//" at pe ", mpp_pe(), & + ", at point (",i,", ", j, ", ", k, "), a = ", a(i,j,k), ", b = ", b(i,j,k) + call mpp_error(FATAL, trim(string)//': mismatch in checksums at data point.') + endif + enddo + enddo + enddo + + sum1 = mpp_chksum( a, (/pe/) ) + sum2 = mpp_chksum( b, (/pe/) ) + + if( sum1.EQ.sum2 )then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(string)//': OK.' 
) + !--- in some case, even though checksum agree, the two arrays + ! actually are different, like comparing (1.1,-1.2) with (-1.1,1.2) + !--- hence we need to check the value point by point. + else + write(stdunit, *)"sum1 =", sum1, mpp_pe() + write(stdunit, *)"sum2 =", sum2, mpp_pe() + write(stdunit,'(a,i3,a,i20,a,i20)')" at pe ", mpp_pe(), " sum(a)=", sum1, " sum(b)=", sum2 + call mpp_error( FATAL, trim(string)//': checksums do not match.' ) + end if + end subroutine compare_checksums_3D_i4 + + !> compare checksums of 2D 64-bit integer arrays + subroutine compare_checksums_2D_i8( a, b, chk_str ) + integer(kind=i8_kind), intent(in), dimension(:,:) :: a, b !< 2D arrays to compare + character(len=*), intent(in) :: chk_str + integer(kind=i8_kind) :: sum1, sum2 + integer :: i, j + integer :: pe + + !> @note can't call mpp_sync here since there might be different number of tiles on each pe. + call mpp_sync_self() + pe = mpp_pe() + + if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) ) & + call mpp_error(FATAL,'compare_checksums_2d_r8: sizes of a and b do not match') + + do j = 1, size(a,2) + do i = 1, size(a,1) + if(a(i,j) .ne. b(i,j)) then + print*, "a =", a(i,j) + print*, "b =", b(i,j) + write(*,'(a,i3,a,i3,a,i3,a,f20.9,a,f20.9)')"at the pe ", mpp_pe(), & + ", at point (",i,", ", j, "),a=", a(i,j), ",b=", b(i,j) + call mpp_error(FATAL, trim(chk_str)//': value mismatch at data point.') + endif + enddo + enddo + + sum1 = mpp_chksum( a, (/pe/) ) + sum2 = mpp_chksum( b, (/pe/) ) + + if( sum1.EQ.sum2 )then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(chk_str)//': OK.' ) + !> @note in some cases, even though the checksum agree, the two arrays + !! actually are different [e.g.,(1.1,-1.2) with (-1.1,1.2)]. + !! Thus, we need to check the values point-by-point. + else + call mpp_error( FATAL, trim(chk_str)//': checksums do not match.' 
) + end if + end subroutine compare_checksums_2D_i8 + + !> Compare the checksums of 2 3D 64-bit real arrays + subroutine compare_checksums_3D_i8( a, b, string ) + integer(kind=i8_kind), intent(in), dimension(:,:,:) :: a, b !< 3D 64-bit real arrays to compare + character(len=*), intent(in) :: string + integer(kind=i8_kind) :: sum1, sum2 + integer :: i, j, k + integer :: pe + ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. + call mpp_sync_self() + pe = mpp_pe() + + if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) .or. size(a,3) .ne. size(b,3) ) & + call mpp_error(FATAL,'compare_checksums_3d_r8: size of a and b does not match') + + do k = 1, size(a,3) + do j = 1, size(a,2) + do i = 1, size(a,1) + if(a(i,j,k) .ne. b(i,j,k)) then + write(*,'(a,i3,a,i3,a,i3,a,i3,a,f20.9,a,f20.9)') trim(string)//" at pe ", mpp_pe(), & + ", at point (",i,", ", j, ", ", k, "), a = ", a(i,j,k), ", b = ", b(i,j,k) + call mpp_error(FATAL, trim(string)//': mismatch in checksums at data point.') + endif + enddo + enddo + enddo + + sum1 = mpp_chksum( a, (/pe/) ) + sum2 = mpp_chksum( b, (/pe/) ) + + if( sum1.EQ.sum2 )then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(string)//': OK.' ) + !--- in some case, even though checksum agree, the two arrays + ! actually are different, like comparing (1.1,-1.2) with (-1.1,1.2) + !--- hence we need to check the value point by point. + else + write(stdunit, *)"sum1 =", sum1, mpp_pe() + write(stdunit, *)"sum2 =", sum2, mpp_pe() + write(stdunit,'(a,i3,a,i20,a,i20)')" at pe ", mpp_pe(), " sum(a)=", sum1, " sum(b)=", sum2 + call mpp_error( FATAL, trim(string)//': checksums do not match.' 
) + end if + end subroutine compare_checksums_3D_i8 + +end module compare_data_checksums_int diff --git a/test_fms/mpp/empty.nml b/test_fms/mpp/empty.nml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_fms/mpp/fill_halo.F90 b/test_fms/mpp/fill_halo.F90 new file mode 100644 index 0000000000..6fd94f28e5 --- /dev/null +++ b/test_fms/mpp/fill_halo.F90 @@ -0,0 +1,710 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @author Jessica Liptak +!> @brief This module contains routines to fill halos in different domain configurations +!! It is required by test_mpp_update_domains_real and test_mpp_update_domains_int. 
+module fill_halo + +use :: platform_mod + +implicit none +private +integer :: whalo = 2, ehalo = 2, shalo = 2, nhalo = 2 +integer :: nx=64, ny=64, nz=10 + +public :: fill_halo_zero, fill_regular_refinement_halo, fill_regular_mosaic_halo +public :: fill_folded_north_halo, fill_folded_south_halo, fill_folded_east_halo, fill_folded_west_halo + +!> Routines to fill halo regions of 64-bit and 32-bit real arrays on a regular grid +interface fill_regular_refinement_halo + module procedure fill_regular_refinement_halo_r8 + module procedure fill_regular_refinement_halo_r4 + module procedure fill_regular_refinement_halo_i8 + module procedure fill_regular_refinement_halo_i4 +end interface + +!> Routines to fill halo regions of 64-bit and 32-bit real arrays with zeros +interface fill_halo_zero + module procedure fill_halo_zero_r8 + module procedure fill_halo_zero_r4 + module procedure fill_halo_zero_i8 + module procedure fill_halo_zero_i4 +end interface + +!> Routines to fill halo regions of 64-bit and 32-bit real arrays on a mosaic grid +interface fill_regular_mosaic_halo + module procedure fill_regular_mosaic_halo_r8 + module procedure fill_regular_mosaic_halo_r4 + module procedure fill_regular_mosaic_halo_i8 + module procedure fill_regular_mosaic_halo_i4 +end interface fill_regular_mosaic_halo + +!> Routines to fill halo regions of 64-bit and 32-bit real arrays on a domain with a folded north edge +interface fill_folded_north_halo + module procedure fill_folded_north_halo_r8 + module procedure fill_folded_north_halo_r4 + module procedure fill_folded_north_halo_i8 + module procedure fill_folded_north_halo_i4 +end interface fill_folded_north_halo + +!> Routines to fill halo regions of 64-bit and 32-bit real arrays on a domain with a folded south edge +interface fill_folded_south_halo + module procedure fill_folded_south_halo_r8 + module procedure fill_folded_south_halo_r4 + module procedure fill_folded_south_halo_i8 + module procedure fill_folded_south_halo_i4 +end interface 
fill_folded_south_halo + +!> Routines to fill halo regions of 64-bit and 32-bit real arrays on a domain with a folded east edge +interface fill_folded_east_halo + module procedure fill_folded_east_halo_r8 + module procedure fill_folded_east_halo_r4 + module procedure fill_folded_east_halo_i8 + module procedure fill_folded_east_halo_i4 +end interface fill_folded_east_halo + +!> Routines to fill halo regions of 64-bit and 32-bit real arrays on a domain with a folded west edge +interface fill_folded_west_halo + module procedure fill_folded_west_halo_r8 + module procedure fill_folded_west_halo_r4 + module procedure fill_folded_west_halo_i8 + module procedure fill_folded_west_halo_i4 +end interface fill_folded_west_halo + +contains + + !> fill the halo region of a 64-bit real array with zeros + subroutine fill_halo_zero_r8(data, whalo, ehalo, shalo, nhalo, xshift, yshift, isc, iec, jsc, jec, isd, ied, jsd, jed) + real(kind=r8_kind), dimension(isd:,jsd:,:), intent(inout) :: data + integer, intent(in) :: isc, iec, jsc, jec, isd, ied, jsd, jed + integer, intent(in) :: whalo, ehalo, shalo, nhalo, xshift, yshift + + if(whalo >=0) then + data(iec+ehalo+1+xshift:ied+xshift,jsd:jed+yshift,:) = 0 + data(isd:isc-whalo-1,jsd:jed+yshift,:) = 0 + else + data(iec+1+xshift:iec-ehalo+xshift,jsc+shalo:jec-nhalo+yshift,:) = 0 + data(isc+whalo:isc-1,jsc+shalo:jec-nhalo+yshift,:) = 0 + end if + + if(shalo>=0) then + data(isd:ied+xshift, jec+nhalo+1+yshift:jed+yshift,:) = 0 + data(isd:ied+xshift, jsd:jsc-shalo-1,:) = 0 + else + data(isc+whalo:iec-ehalo+xshift,jec+1+yshift:jec-nhalo+yshift,:) = 0 + data(isc+whalo:iec-ehalo+xshift,jsc+shalo:jsc-1,:) = 0 + end if + end subroutine fill_halo_zero_r8 + + !> fill the halo region of a 32-bit real array with zeros + subroutine fill_halo_zero_r4(data, whalo, ehalo, shalo, nhalo, xshift, yshift, isc, iec, jsc, jec, isd, ied, jsd, jed) + real(kind=r4_kind), dimension(isd:,jsd:,:), intent(inout) :: data + integer, intent(in) :: isc, iec, jsc, jec, isd, 
ied, jsd, jed + integer, intent(in) :: whalo, ehalo, shalo, nhalo, xshift, yshift + + if(whalo >=0) then + data(iec+ehalo+1+xshift:ied+xshift,jsd:jed+yshift,:) = 0 + data(isd:isc-whalo-1,jsd:jed+yshift,:) = 0 + else + data(iec+1+xshift:iec-ehalo+xshift,jsc+shalo:jec-nhalo+yshift,:) = 0 + data(isc+whalo:isc-1,jsc+shalo:jec-nhalo+yshift,:) = 0 + end if + + if(shalo>=0) then + data(isd:ied+xshift, jec+nhalo+1+yshift:jed+yshift,:) = 0 + data(isd:ied+xshift, jsd:jsc-shalo-1,:) = 0 + else + data(isc+whalo:iec-ehalo+xshift,jec+1+yshift:jec-nhalo+yshift,:) = 0 + data(isc+whalo:iec-ehalo+xshift,jsc+shalo:jsc-1,:) = 0 + end if + end subroutine fill_halo_zero_r4 + +!> fill the halo region of a 64-bit integer array with zeros + subroutine fill_halo_zero_i8(data, whalo, ehalo, shalo, nhalo, xshift, yshift, isc, iec, jsc, jec, isd, ied, jsd, jed) + integer(kind=i8_kind), dimension(isd:,jsd:,:), intent(inout) :: data + integer, intent(in) :: isc, iec, jsc, jec, isd, ied, jsd, jed + integer, intent(in) :: whalo, ehalo, shalo, nhalo, xshift, yshift + + if(whalo >=0) then + data(iec+ehalo+1+xshift:ied+xshift,jsd:jed+yshift,:) = 0 + data(isd:isc-whalo-1,jsd:jed+yshift,:) = 0 + else + data(iec+1+xshift:iec-ehalo+xshift,jsc+shalo:jec-nhalo+yshift,:) = 0 + data(isc+whalo:isc-1,jsc+shalo:jec-nhalo+yshift,:) = 0 + end if + + if(shalo>=0) then + data(isd:ied+xshift, jec+nhalo+1+yshift:jed+yshift,:) = 0 + data(isd:ied+xshift, jsd:jsc-shalo-1,:) = 0 + else + data(isc+whalo:iec-ehalo+xshift,jec+1+yshift:jec-nhalo+yshift,:) = 0 + data(isc+whalo:iec-ehalo+xshift,jsc+shalo:jsc-1,:) = 0 + end if + end subroutine fill_halo_zero_i8 + +!> fill the halo region of a 32-bit integer array with zeros + subroutine fill_halo_zero_i4(data, whalo, ehalo, shalo, nhalo, xshift, yshift, isc, iec, jsc, jec, isd, ied, jsd, jed) + integer(kind=i4_kind), dimension(isd:,jsd:,:), intent(inout) :: data + integer, intent(in) :: isc, iec, jsc, jec, isd, ied, jsd, jed + integer, intent(in) :: whalo, ehalo, shalo, nhalo, 
xshift, yshift + + if(whalo >=0) then + data(iec+ehalo+1+xshift:ied+xshift,jsd:jed+yshift,:) = 0 + data(isd:isc-whalo-1,jsd:jed+yshift,:) = 0 + else + data(iec+1+xshift:iec-ehalo+xshift,jsc+shalo:jec-nhalo+yshift,:) = 0 + data(isc+whalo:isc-1,jsc+shalo:jec-nhalo+yshift,:) = 0 + end if + + if(shalo>=0) then + data(isd:ied+xshift, jec+nhalo+1+yshift:jed+yshift,:) = 0 + data(isd:ied+xshift, jsd:jsc-shalo-1,:) = 0 + else + data(isc+whalo:iec-ehalo+xshift,jec+1+yshift:jec-nhalo+yshift,:) = 0 + data(isc+whalo:iec-ehalo+xshift,jsc+shalo:jsc-1,:) = 0 + end if + end subroutine fill_halo_zero_i4 + + + !> fill the halo region of 64-bit array on a regular grid + subroutine fill_regular_refinement_halo_r8( data, data_all, ni, nj, tm, te, tse, ts, & + tsw, tw, tnw, tn, tne, ioff, joff ) + real(kind=r8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + real(kind=r8_kind), dimension(:,:,:,:), intent(in) :: data_all + integer, dimension(:), intent(in) :: ni, nj + integer, intent(in) :: tm, te, tse, ts, tsw, tw, tnw, tn, tne + integer, intent(in) :: ioff, joff + + + if(te>0) data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, 1:nj(tm)+joff, :) = & + data_all(1+ioff:ehalo+ioff, 1:nj(te)+joff, :,te) ! east + if(ts>0) data (1:ni(tm)+ioff, 1-shalo:0, :) = & + data_all(1:ni(ts)+ioff, nj(ts)-shalo+1:nj(ts), :,ts) ! south + if(tw>0) data (1-whalo:0, 1:nj(tm)+joff, :) = & + data_all(ni(tw)-whalo+1:ni(tw), 1:nj(tw)+joff, :,tw) ! west + if(tn>0) data (1:ni(tm)+ioff, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(1:ni(tn)+ioff, 1+joff:nhalo+joff, :,tn) ! north + if(tse>0)data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, 1-shalo:0, :) = & + data_all(1+ioff:ehalo+ioff, nj(tse)-shalo+1:nj(tse), :,tse) ! southeast + if(tsw>0)data (1-whalo:0, 1-shalo:0, :) = & + data_all(ni(tsw)-whalo+1:ni(tsw), nj(tsw)-shalo+1:nj(tsw), :,tsw) ! southwest + if(tne>0)data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(1+ioff:ehalo+ioff, 1+joff:nhalo+joff, :,tnw) ! 
northeast + if(tnw>0)data (1-whalo:0, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(ni(tnw)-whalo+1:ni(tnw), 1+joff:nhalo+joff, :,tne) ! northwest + + end subroutine fill_regular_refinement_halo_r8 + + !> fill the halo region of 32-bit array on a regular grid + subroutine fill_regular_refinement_halo_r4( data, data_all, ni, nj, tm, te, tse, ts, tsw, tw, tnw, tn, tne, ioff, joff ) + real(kind=r4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + real(kind=r4_kind), dimension(:,:,:,:), intent(in) :: data_all + integer, dimension(:), intent(in) :: ni, nj + integer, intent(in) :: tm, te, tse, ts, tsw, tw, tnw, tn, tne + integer, intent(in) :: ioff, joff + + + if(te>0) data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, 1:nj(tm)+joff, :) = & + data_all(1+ioff:ehalo+ioff, 1:nj(te)+joff, :,te) ! east + if(ts>0) data (1:ni(tm)+ioff, 1-shalo:0, :) = & + data_all(1:ni(ts)+ioff, nj(ts)-shalo+1:nj(ts), :,ts) ! south + if(tw>0) data (1-whalo:0, 1:nj(tm)+joff, :) = & + data_all(ni(tw)-whalo+1:ni(tw), 1:nj(tw)+joff, :,tw) ! west + if(tn>0) data (1:ni(tm)+ioff, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(1:ni(tn)+ioff, 1+joff:nhalo+joff, :,tn) ! north + if(tse>0)data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, 1-shalo:0, :) = & + data_all(1+ioff:ehalo+ioff, nj(tse)-shalo+1:nj(tse), :,tse) ! southeast + if(tsw>0)data (1-whalo:0, 1-shalo:0, :) = & + data_all(ni(tsw)-whalo+1:ni(tsw), nj(tsw)-shalo+1:nj(tsw), :,tsw) ! southwest + if(tne>0)data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(1+ioff:ehalo+ioff, 1+joff:nhalo+joff, :,tnw) ! northeast + if(tnw>0)data (1-whalo:0, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(ni(tnw)-whalo+1:ni(tnw), 1+joff:nhalo+joff, :,tne) ! 
northwest + + end subroutine fill_regular_refinement_halo_r4 + +!> fill the halo region of 64-bit integer array on a regular grid + subroutine fill_regular_refinement_halo_i8( data, data_all, ni, nj, tm, te, tse, ts, tsw, & + tw, tnw, tn, tne, ioff, joff ) + integer(kind=i8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer(kind=i8_kind), dimension(:,:,:,:), intent(in) :: data_all + integer, dimension(:), intent(in) :: ni, nj + integer, intent(in) :: tm, te, tse, ts, tsw, tw, tnw, tn, tne + integer, intent(in) :: ioff, joff + + + if(te>0) data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, 1:nj(tm)+joff, :) = & + data_all(1+ioff:ehalo+ioff, 1:nj(te)+joff, :,te) ! east + if(ts>0) data (1:ni(tm)+ioff, 1-shalo:0, :) = & + data_all(1:ni(ts)+ioff, nj(ts)-shalo+1:nj(ts), :,ts) ! south + if(tw>0) data (1-whalo:0, 1:nj(tm)+joff, :) = & + data_all(ni(tw)-whalo+1:ni(tw), 1:nj(tw)+joff, :,tw) ! west + if(tn>0) data (1:ni(tm)+ioff, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(1:ni(tn)+ioff, 1+joff:nhalo+joff, :,tn) ! north + if(tse>0)data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, 1-shalo:0, :) = & + data_all(1+ioff:ehalo+ioff, nj(tse)-shalo+1:nj(tse), :,tse) ! southeast + if(tsw>0)data (1-whalo:0, 1-shalo:0, :) = & + data_all(ni(tsw)-whalo+1:ni(tsw), nj(tsw)-shalo+1:nj(tsw), :,tsw) ! southwest + if(tne>0)data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(1+ioff:ehalo+ioff, 1+joff:nhalo+joff, :,tnw) ! northeast + if(tnw>0)data (1-whalo:0, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(ni(tnw)-whalo+1:ni(tnw), 1+joff:nhalo+joff, :,tne) ! 
northwest + + end subroutine fill_regular_refinement_halo_i8 + +!> fill the halo region of 32-bit integer array on a regular grid + subroutine fill_regular_refinement_halo_i4( data, data_all, ni, nj, tm, te, tse, ts, tsw, tw, tnw, tn, tne, ioff, joff ) + integer(kind=i4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer(kind=i4_kind), dimension(:,:,:,:), intent(in) :: data_all + integer, dimension(:), intent(in) :: ni, nj + integer, intent(in) :: tm, te, tse, ts, tsw, tw, tnw, tn, tne + integer, intent(in) :: ioff, joff + + + if(te>0) data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, 1:nj(tm)+joff, :) = & + data_all(1+ioff:ehalo+ioff, 1:nj(te)+joff, :,te) ! east + if(ts>0) data (1:ni(tm)+ioff, 1-shalo:0, :) = & + data_all(1:ni(ts)+ioff, nj(ts)-shalo+1:nj(ts), :,ts) ! south + if(tw>0) data (1-whalo:0, 1:nj(tm)+joff, :) = & + data_all(ni(tw)-whalo+1:ni(tw), 1:nj(tw)+joff, :,tw) ! west + if(tn>0) data (1:ni(tm)+ioff, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(1:ni(tn)+ioff, 1+joff:nhalo+joff, :,tn) ! north + if(tse>0)data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, 1-shalo:0, :) = & + data_all(1+ioff:ehalo+ioff, nj(tse)-shalo+1:nj(tse), :,tse) ! southeast + if(tsw>0)data (1-whalo:0, 1-shalo:0, :) = & + data_all(ni(tsw)-whalo+1:ni(tsw), nj(tsw)-shalo+1:nj(tsw), :,tsw) ! southwest + if(tne>0)data (ni(tm)+1+ioff:ni(tm)+ehalo+ioff, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(1+ioff:ehalo+ioff, 1+joff:nhalo+joff, :,tnw) ! northeast + if(tnw>0)data (1-whalo:0, nj(tm)+1+joff:nj(tm)+nhalo+joff, :) = & + data_all(ni(tnw)-whalo+1:ni(tnw), 1+joff:nhalo+joff, :,tne) ! northwest + + end subroutine fill_regular_refinement_halo_i4 + + ! 
Fill the halo points of a 64-bit real array on the regular mosaic grid + subroutine fill_regular_mosaic_halo_r8(data, data_all, te, tse, ts, tsw, tw, tnw, tn, tne) + real(kind=r8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + real(kind=r8_kind), dimension(:,:,:,:), intent(in) :: data_all + integer, intent(in) :: te, tse, ts, tsw, tw, tnw, tn, tne + + data(nx+1:nx+ehalo, 1:ny, :) = data_all(1:ehalo, 1:ny, :, te) ! east + data(1:nx, 1-shalo:0, :) = data_all(1:nx, ny-shalo+1:ny, :, ts) ! south + data(1-whalo:0, 1:ny, :) = data_all(nx-whalo+1:nx, 1:ny, :, tw) ! west + data(1:nx, ny+1:ny+nhalo, :) = data_all(1:nx, 1:nhalo, :, tn) ! north + data(nx+1:nx+ehalo, 1-shalo:0, :) = data_all(1:ehalo, ny-shalo+1:ny, :,tse) ! southeast + data(1-whalo:0, 1-shalo:0, :) = data_all(nx-whalo+1:nx, ny-shalo+1:ny, :,tsw) ! southwest + data(nx+1:nx+ehalo, ny+1:ny+nhalo, :) = data_all(1:ehalo, 1:nhalo, :,tnw) ! northeast + data(1-whalo:0, ny+1:ny+nhalo, :) = data_all(nx-whalo+1:nx, 1:nhalo, :,tne) ! northwest + end subroutine fill_regular_mosaic_halo_r8 + + !> Fill the halo points of a 32-bit real array on the regular mosaic grid + subroutine fill_regular_mosaic_halo_r4(data, data_all, te, tse, ts, tsw, tw, tnw, tn, tne) + real(kind=r4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + real(kind=r4_kind), dimension(:,:,:,:), intent(in) :: data_all + integer, intent(in) :: te, tse, ts, tsw, tw, tnw, tn, tne + + data(nx+1:nx+ehalo, 1:ny, :) = data_all(1:ehalo, 1:ny, :, te) ! east + data(1:nx, 1-shalo:0, :) = data_all(1:nx, ny-shalo+1:ny, :, ts) ! south + data(1-whalo:0, 1:ny, :) = data_all(nx-whalo+1:nx, 1:ny, :, tw) ! west + data(1:nx, ny+1:ny+nhalo, :) = data_all(1:nx, 1:nhalo, :, tn) ! north + data(nx+1:nx+ehalo, 1-shalo:0, :) = data_all(1:ehalo, ny-shalo+1:ny, :,tse) ! southeast + data(1-whalo:0, 1-shalo:0, :) = data_all(nx-whalo+1:nx, ny-shalo+1:ny, :,tsw) ! southwest + data(nx+1:nx+ehalo, ny+1:ny+nhalo, :) = data_all(1:ehalo, 1:nhalo, :,tnw) ! 
northeast + data(1-whalo:0, ny+1:ny+nhalo, :) = data_all(nx-whalo+1:nx, 1:nhalo, :,tne) ! northwest + end subroutine fill_regular_mosaic_halo_r4 + + ! Fill the halo points of a 64-bit integer array on the regular mosaic grid + subroutine fill_regular_mosaic_halo_i8(data, data_all, te, tse, ts, tsw, tw, tnw, tn, tne) + integer(kind=i8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer(kind=i8_kind), dimension(:,:,:,:), intent(in) :: data_all + integer, intent(in) :: te, tse, ts, tsw, tw, tnw, tn, tne + + data(nx+1:nx+ehalo, 1:ny, :) = data_all(1:ehalo, 1:ny, :, te) ! east + data(1:nx, 1-shalo:0, :) = data_all(1:nx, ny-shalo+1:ny, :, ts) ! south + data(1-whalo:0, 1:ny, :) = data_all(nx-whalo+1:nx, 1:ny, :, tw) ! west + data(1:nx, ny+1:ny+nhalo, :) = data_all(1:nx, 1:nhalo, :, tn) ! north + data(nx+1:nx+ehalo, 1-shalo:0, :) = data_all(1:ehalo, ny-shalo+1:ny, :,tse) ! southeast + data(1-whalo:0, 1-shalo:0, :) = data_all(nx-whalo+1:nx, ny-shalo+1:ny, :,tsw) ! southwest + data(nx+1:nx+ehalo, ny+1:ny+nhalo, :) = data_all(1:ehalo, 1:nhalo, :,tnw) ! northeast + data(1-whalo:0, ny+1:ny+nhalo, :) = data_all(nx-whalo+1:nx, 1:nhalo, :,tne) ! northwest + end subroutine fill_regular_mosaic_halo_i8 + + !> Fill the halo points of a 64-bit integer array on the regular mosaic grid + subroutine fill_regular_mosaic_halo_i4(data, data_all, te, tse, ts, tsw, tw, tnw, tn, tne) + integer(kind=i4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer(kind=i4_kind), dimension(:,:,:,:), intent(in) :: data_all + integer, intent(in) :: te, tse, ts, tsw, tw, tnw, tn, tne + + data(nx+1:nx+ehalo, 1:ny, :) = data_all(1:ehalo, 1:ny, :, te) ! east + data(1:nx, 1-shalo:0, :) = data_all(1:nx, ny-shalo+1:ny, :, ts) ! south + data(1-whalo:0, 1:ny, :) = data_all(nx-whalo+1:nx, 1:ny, :, tw) ! west + data(1:nx, ny+1:ny+nhalo, :) = data_all(1:nx, 1:nhalo, :, tn) ! north + data(nx+1:nx+ehalo, 1-shalo:0, :) = data_all(1:ehalo, ny-shalo+1:ny, :,tse) ! 
southeast + data(1-whalo:0, 1-shalo:0, :) = data_all(nx-whalo+1:nx, ny-shalo+1:ny, :,tsw) ! southwest + data(nx+1:nx+ehalo, ny+1:ny+nhalo, :) = data_all(1:ehalo, 1:nhalo, :,tnw) ! northeast + data(1-whalo:0, ny+1:ny+nhalo, :) = data_all(nx-whalo+1:nx, 1:nhalo, :,tne) ! northwest + end subroutine fill_regular_mosaic_halo_i4 + + !> Fill the halo region of a 64-bit array real on a domain with a folded north edge + subroutine fill_folded_north_halo_r8(data, ioff, joff, ishift, jshift, sign) + real(kind=r8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = ishift - ioff + m2 = 2*ishift - ioff + + data(1-whalo:0,1:nyp,:) = data(nx-whalo+1:nx,1:ny+jshift,:) ! west + data(nx+1:nx+ehalo+ishift,1:nyp,:) = data(1:ehalo+ishift,1:ny+jshift,:) ! east + if(m1 .GE. 1-whalo) & + data(1-whalo:m1,nyp+1:nyp+nhalo,:) = sign*data(whalo+m2:1+ishift:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + data(m1+1:nx+m2,nyp+1:nyp+nhalo,:) = sign*data(nx+ishift:1:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + data(nx+m2+1:nxp+ehalo,nyp+1:nyp+nhalo,:) = sign*data(nx:nx-ehalo+m1+1:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + + end subroutine fill_folded_north_halo_r8 + + !> Fill the halo region of a 32-bit real array on a domain with a folded north edge + subroutine fill_folded_north_halo_r4(data, ioff, joff, ishift, jshift, sign) + real(kind=r4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = ishift - ioff + m2 = 2*ishift - ioff + + data(1-whalo:0,1:nyp,:) = data(nx-whalo+1:nx,1:ny+jshift,:) ! west + data(nx+1:nx+ehalo+ishift,1:nyp,:) = data(1:ehalo+ishift,1:ny+jshift,:) ! east + + if(m1 .GE. 
1-whalo) & + data(1-whalo:m1,nyp+1:nyp+nhalo,:) = sign*data(whalo+m2:1+ishift:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + data(m1+1:nx+m2,nyp+1:nyp+nhalo,:) = sign*data(nx+ishift:1:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + data(nx+m2+1:nxp+ehalo,nyp+1:nyp+nhalo,:) = sign*data(nx:nx-ehalo+m1+1:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + + end subroutine fill_folded_north_halo_r4 + + !> Fill the halo region of a 64-bit integer array on a domain with a folded north edge + subroutine fill_folded_north_halo_i8(data, ioff, joff, ishift, jshift, sign) + integer(kind=i8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = ishift - ioff + m2 = 2*ishift - ioff + + data(1-whalo:0,1:nyp,:) = data(nx-whalo+1:nx,1:ny+jshift,:) ! west + data(nx+1:nx+ehalo+ishift,1:nyp,:) = data(1:ehalo+ishift,1:ny+jshift,:) ! east + if(m1 .GE. 1-whalo) & + data(1-whalo:m1,nyp+1:nyp+nhalo,:) = sign*data(whalo+m2:1+ishift:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + data(m1+1:nx+m2,nyp+1:nyp+nhalo,:) = sign*data(nx+ishift:1:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + data(nx+m2+1:nxp+ehalo,nyp+1:nyp+nhalo,:) = sign*data(nx:nx-ehalo+m1+1:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + + end subroutine fill_folded_north_halo_i8 + + !> Fill the halo region of a 32-bit integer array on a domain with a folded north edge + subroutine fill_folded_north_halo_i4(data, ioff, joff, ishift, jshift, sign) + integer(kind=i4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = ishift - ioff + m2 = 2*ishift - ioff + + data(1-whalo:0,1:nyp,:) = data(nx-whalo+1:nx,1:ny+jshift,:) ! west + data(nx+1:nx+ehalo+ishift,1:nyp,:) = data(1:ehalo+ishift,1:ny+jshift,:) ! east + + if(m1 .GE. 
1-whalo) & + data(1-whalo:m1,nyp+1:nyp+nhalo,:) = sign*data(whalo+m2:1+ishift:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + data(m1+1:nx+m2,nyp+1:nyp+nhalo,:) = sign*data(nx+ishift:1:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + data(nx+m2+1:nxp+ehalo,nyp+1:nyp+nhalo,:) = sign*data(nx:nx-ehalo+m1+1:-1,nyp-joff:nyp-nhalo-joff+1:-1,:) + + end subroutine fill_folded_north_halo_i4 + + !> Fill the halo region of a 64-bit real array on a domain with a folded south edge + subroutine fill_folded_south_halo_r8(data, ioff, joff, ishift, jshift, sign) + real(kind=r8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = ishift - ioff + m2 = 2*ishift - ioff + + data(1-whalo:0,1:nyp,:) = data(nx-whalo+1:nx,1:nyp,:) ! west + data(nx+1:nx+ehalo+ishift,1:nyp,:) = data(1:ehalo+ishift, 1:nyp,:) ! east + if(m1 .GE. 1-whalo) & + data(1-whalo:m1,1-shalo:0,:) = sign*data(whalo+m2:1+ishift:-1, shalo+jshift:1+jshift:-1,:) + + data(m1+1:nx+m2,1-shalo:0,:) = sign*data(nxp:1:-1,shalo+jshift:1+jshift:-1,:) + data(nx+m2+1:nxp+ehalo,1-shalo:0,:) = sign*data(nx:nx-ehalo+m1+1:-1,shalo+jshift:1+jshift:-1,:) + + end subroutine fill_folded_south_halo_r8 + + !> Fill the halo region of a 32-bit real array on a domain with a folded south edge + subroutine fill_folded_south_halo_r4(data, ioff, joff, ishift, jshift, sign) + real(kind=r4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = ishift - ioff + m2 = 2*ishift - ioff + + data(1-whalo:0,1:nyp,:) = data(nx-whalo+1:nx,1:nyp,:) ! west + data(nx+1:nx+ehalo+ishift,1:nyp,:) = data(1:ehalo+ishift, 1:nyp,:) ! east + if(m1 .GE. 
1-whalo) & + data(1-whalo:m1,1-shalo:0,:) = sign*data(whalo+m2:1+ishift:-1, shalo+jshift:1+jshift:-1,:) + + data(m1+1:nx+m2,1-shalo:0,:) = sign*data(nxp:1:-1,shalo+jshift:1+jshift:-1,:) + data(nx+m2+1:nxp+ehalo,1-shalo:0,:) = sign*data(nx:nx-ehalo+m1+1:-1,shalo+jshift:1+jshift:-1,:) + + end subroutine fill_folded_south_halo_r4 + + !> Fill the halo region of a 64-bit intger array on a domain with a folded south edge + subroutine fill_folded_south_halo_i8(data, ioff, joff, ishift, jshift, sign) + integer(kind=i8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = ishift - ioff + m2 = 2*ishift - ioff + + data(1-whalo:0,1:nyp,:) = data(nx-whalo+1:nx,1:nyp,:) ! west + data(nx+1:nx+ehalo+ishift,1:nyp,:) = data(1:ehalo+ishift, 1:nyp,:) ! east + if(m1 .GE. 1-whalo) & + data(1-whalo:m1,1-shalo:0,:) = sign*data(whalo+m2:1+ishift:-1, shalo+jshift:1+jshift:-1,:) + + data(m1+1:nx+m2,1-shalo:0,:) = sign*data(nxp:1:-1,shalo+jshift:1+jshift:-1,:) + data(nx+m2+1:nxp+ehalo,1-shalo:0,:) = sign*data(nx:nx-ehalo+m1+1:-1,shalo+jshift:1+jshift:-1,:) + + end subroutine fill_folded_south_halo_i8 + + !> Fill the halo region of a 32-bit integer array on a domain with a folded south edge + subroutine fill_folded_south_halo_i4(data, ioff, joff, ishift, jshift, sign) + integer(kind=i4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = ishift - ioff + m2 = 2*ishift - ioff + + data(1-whalo:0,1:nyp,:) = data(nx-whalo+1:nx,1:nyp,:) ! west + data(nx+1:nx+ehalo+ishift,1:nyp,:) = data(1:ehalo+ishift, 1:nyp,:) ! east + if(m1 .GE. 
1-whalo) & + data(1-whalo:m1,1-shalo:0,:) = sign*data(whalo+m2:1+ishift:-1, shalo+jshift:1+jshift:-1,:) + + data(m1+1:nx+m2,1-shalo:0,:) = sign*data(nxp:1:-1,shalo+jshift:1+jshift:-1,:) + data(nx+m2+1:nxp+ehalo,1-shalo:0,:) = sign*data(nx:nx-ehalo+m1+1:-1,shalo+jshift:1+jshift:-1,:) + + end subroutine fill_folded_south_halo_i4 + + !> Fill the halo region of a 64-bit real array on a domain with a folded west edge + subroutine fill_folded_west_halo_r8(data, ioff, joff, ishift, jshift, sign) + real(kind=r8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = jshift - joff + m2 = 2*jshift - joff + + data(1:nxp, 1-shalo:0,:) = data(1:nxp, ny-shalo+1:ny, :) ! south + data(1:nxp, ny+1:nyp+nhalo,:) = data(1:nxp, 1:nhalo+jshift,:) ! north + if(m1 .GE. 1-shalo) & + data(1-whalo:0, 1-shalo:m1,:) = sign*data(whalo+ishift:1+ishift:-1,shalo+m2:1+jshift:-1,:) + data(1-whalo:0, m1+1:ny+m2,:) = sign*data(whalo+ishift:1+ishift:-1, nyp:1:-1, :) + data(1-whalo:0, ny+m2+1:nyp+nhalo,:) = sign*data(whalo+ishift:1+ishift:-1, ny:ny-nhalo+m1+1:-1,:) + + end subroutine fill_folded_west_halo_r8 + + !> Fill the halo region of a 32-bit real array on a domain with a folded west edge + subroutine fill_folded_west_halo_r4(data, ioff, joff, ishift, jshift, sign) + real(kind=r4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = jshift - joff + m2 = 2*jshift - joff + + data(1:nxp, 1-shalo:0,:) = data(1:nxp, ny-shalo+1:ny, :) ! south + data(1:nxp, ny+1:nyp+nhalo,:) = data(1:nxp, 1:nhalo+jshift,:) ! north + if(m1 .GE. 
1-shalo) & + data(1-whalo:0, 1-shalo:m1,:) = sign*data(whalo+ishift:1+ishift:-1,shalo+m2:1+jshift:-1,:) + data(1-whalo:0, m1+1:ny+m2,:) = sign*data(whalo+ishift:1+ishift:-1, nyp:1:-1, :) + data(1-whalo:0, ny+m2+1:nyp+nhalo,:) = sign*data(whalo+ishift:1+ishift:-1, ny:ny-nhalo+m1+1:-1,:) + + end subroutine fill_folded_west_halo_r4 + + !> Fill the halo region of a 64-bit integer array on a domain with a folded west edge + subroutine fill_folded_west_halo_i8(data, ioff, joff, ishift, jshift, sign) + integer(kind=i8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = jshift - joff + m2 = 2*jshift - joff + + data(1:nxp, 1-shalo:0,:) = data(1:nxp, ny-shalo+1:ny, :) ! south + data(1:nxp, ny+1:nyp+nhalo,:) = data(1:nxp, 1:nhalo+jshift,:) ! north + if(m1 .GE. 1-shalo) & + data(1-whalo:0, 1-shalo:m1,:) = sign*data(whalo+ishift:1+ishift:-1,shalo+m2:1+jshift:-1,:) + data(1-whalo:0, m1+1:ny+m2,:) = sign*data(whalo+ishift:1+ishift:-1, nyp:1:-1, :) + data(1-whalo:0, ny+m2+1:nyp+nhalo,:) = sign*data(whalo+ishift:1+ishift:-1, ny:ny-nhalo+m1+1:-1,:) + + end subroutine fill_folded_west_halo_i8 + + !> Fill the halo region of a 32-bit integer array on a domain with a folded west edge + subroutine fill_folded_west_halo_i4(data, ioff, joff, ishift, jshift, sign) + integer(kind=i4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = jshift - joff + m2 = 2*jshift - joff + + data(1:nxp, 1-shalo:0,:) = data(1:nxp, ny-shalo+1:ny, :) ! south + data(1:nxp, ny+1:nyp+nhalo,:) = data(1:nxp, 1:nhalo+jshift,:) ! north + if(m1 .GE. 
1-shalo) & + data(1-whalo:0, 1-shalo:m1,:) = sign*data(whalo+ishift:1+ishift:-1,shalo+m2:1+jshift:-1,:) + data(1-whalo:0, m1+1:ny+m2,:) = sign*data(whalo+ishift:1+ishift:-1, nyp:1:-1, :) + data(1-whalo:0, ny+m2+1:nyp+nhalo,:) = sign*data(whalo+ishift:1+ishift:-1, ny:ny-nhalo+m1+1:-1,:) + + end subroutine fill_folded_west_halo_i4 + + !> Fill the halo region of a 64-bit real array on a domain with a folded east edge + subroutine fill_folded_east_halo_r8(data, ioff, joff, ishift, jshift, sign) + real(kind=r8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = jshift - joff + m2 = 2*jshift - joff + + data(1:nxp, 1-shalo:0, :) = data(1:nxp, ny-shalo+1:ny, :) ! south + data(1:nxp, ny+1:nyp+nhalo, :) = data(1:nxp, 1:nhalo+jshift,:) ! north + if(m1 .GE. 1-shalo) & + data(nxp+1:nxp+ehalo, 1-shalo:m1, :) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, shalo+m2:1+jshift:-1,:) + + data(nxp+1:nxp+ehalo, m1+1:ny+m2, :) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, nyp:1:-1, :) + data(nxp+1:nxp+ehalo, ny+m2+1:nyp+nhalo,:) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, ny:ny-nhalo+m1+1:-1,:) + + end subroutine fill_folded_east_halo_r8 + + !> Fill the halo region of a 32-bit real array on a domain with a folded east edge + subroutine fill_folded_east_halo_r4(data, ioff, joff, ishift, jshift, sign) + real(kind=r4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = jshift - joff + m2 = 2*jshift - joff + + data(1:nxp, 1-shalo:0, :) = data(1:nxp, ny-shalo+1:ny, :) ! south + data(1:nxp, ny+1:nyp+nhalo, :) = data(1:nxp, 1:nhalo+jshift,:) ! north + if(m1 .GE. 
1-shalo) & + data(nxp+1:nxp+ehalo, 1-shalo:m1, :) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, shalo+m2:1+jshift:-1,:) + + data(nxp+1:nxp+ehalo, m1+1:ny+m2, :) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, nyp:1:-1, :) + data(nxp+1:nxp+ehalo, ny+m2+1:nyp+nhalo,:) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, ny:ny-nhalo+m1+1:-1,:) + + end subroutine fill_folded_east_halo_r4 + + !> Fill the halo region of a 64-bit integer array on a domain with a folded east edge + subroutine fill_folded_east_halo_i8(data, ioff, joff, ishift, jshift, sign) + integer(kind=i8_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = jshift - joff + m2 = 2*jshift - joff + + data(1:nxp, 1-shalo:0, :) = data(1:nxp, ny-shalo+1:ny, :) ! south + data(1:nxp, ny+1:nyp+nhalo, :) = data(1:nxp, 1:nhalo+jshift,:) ! north + if(m1 .GE. 1-shalo) & + data(nxp+1:nxp+ehalo, 1-shalo:m1, :) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, shalo+m2:1+jshift:-1,:) + + data(nxp+1:nxp+ehalo, m1+1:ny+m2, :) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, nyp:1:-1, :) + data(nxp+1:nxp+ehalo, ny+m2+1:nyp+nhalo,:) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, ny:ny-nhalo+m1+1:-1,:) + + end subroutine fill_folded_east_halo_i8 + + !> Fill the halo region of a 32-bit integer array on a domain with a folded east edge + subroutine fill_folded_east_halo_i4(data, ioff, joff, ishift, jshift, sign) + integer(kind=i4_kind), dimension(1-whalo:,1-shalo:,:), intent(inout) :: data + integer, intent(in) :: ioff, joff, ishift, jshift, sign + ! local + integer :: nxp, nyp, m1, m2 + + nxp = nx+ishift + nyp = ny+jshift + m1 = jshift - joff + m2 = 2*jshift - joff + + data(1:nxp, 1-shalo:0, :) = data(1:nxp, ny-shalo+1:ny, :) ! south + data(1:nxp, ny+1:nyp+nhalo, :) = data(1:nxp, 1:nhalo+jshift,:) ! north + if(m1 .GE. 
1-shalo) & + data(nxp+1:nxp+ehalo, 1-shalo:m1, :) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, shalo+m2:1+jshift:-1,:) + + data(nxp+1:nxp+ehalo, m1+1:ny+m2, :) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, nyp:1:-1, :) + data(nxp+1:nxp+ehalo, ny+m2+1:nyp+nhalo,:) = sign*data(nxp-ioff:nxp-ehalo-ioff+1:-1, ny:ny-nhalo+m1+1:-1,:) + + end subroutine fill_folded_east_halo_i4 + +end module fill_halo + diff --git a/test_fms/mpp/input.nml b/test_fms/mpp/input.nml new file mode 100644 index 0000000000..bc103e5ef1 --- /dev/null +++ b/test_fms/mpp/input.nml @@ -0,0 +1,67 @@ +&test_mpp_pset_nml +test_number = +/ + +&test_mpp_get_ascii_lines_nml +test_number = +/ + + +&test_mpp_domains_nml +nx=64 +ny=64 +nz=10 +stackmax=10000000 +debug=.false. +mpes = 3 +check_parallel = .false. +whalo = 2 +ehalo = 2 +shalo = 2 +nhalo = 2 +x_cyclic_offset = 3 +y_cyclic_offset = -4 +warn_level = "fatal" +wide_halo_x = 0 +wide_halo_y = 0 +nx_cubic = 20 +ny_cubic = 20 +test_performance = .false. +test_interface = .false. +num_fields = 4 +do_sleep = .false. +num_iter = 1 +! NEST inputs +test_nest = .false. +num_nest = 3 +tile_coarse = 1, 3, 7 +tile_fine = 7 , 8, 9 +istart_coarse = 3, 3, 5 +icount_coarse = 40, 5, 6 +jstart_coarse = 3, 3, 6 +jcount_coarse = 14, 6, 8 +extra_halo = 0 +ntiles_nest_all = 9 +npes_nest_tile = 2, 2, 2, 2, 2, 2, 2, 1, 1 +nest_level = 1, 1, 2 +refine_ratio = 2, 2, 2 +cyclic_nest = 'N' +! NEST inputs end +mix_2D_3D = .false. +test_get_nbr = .false. +test_edge_update = .false. +test_cubic_grid_redistribute = .false. +ensemble_size = 1 +layout_cubic = 0,0 +layout_ensemble = 0,0 +nthreads = 1 +test_boundary = .false. +layout_tripolar = 0,0 +test_group = .false. +test_global_sum = .false. +test_unstruct = .false. +test_nonsym_edge = .false. +test_halosize_performance = .false. +test_adjoint = .false. +wide_halo = .false. 
+/ diff --git a/test_fms/mpp/input_alternative.nml b/test_fms/mpp/input_alternative.nml new file mode 100644 index 0000000000..61594644eb --- /dev/null +++ b/test_fms/mpp/input_alternative.nml @@ -0,0 +1,67 @@ +&test_mpp_pset_nml +test_number = +/ + +&test_mpp_get_ascii_lines_nml +test_number = +/ + + +&test_mpp_domains_nml +nx=64 +ny=64 +nz=20 +stackmax=20000000 +debug=.false. +mpes = 3 +check_parallel = .false. +whalo = 2 +ehalo = 2 +shalo = 2 +nhalo = 2 +x_cyclic_offset = 3 +y_cyclic_offset = -4 +warn_level = "fatal" +wide_halo_x = 0 +wide_halo_y = 0 +nx_cubic = 20 +ny_cubic = 20 +test_performance = .false. +test_interface = .false. +num_fields = 4 +do_sleep = .false. +num_iter = 2 +! NEST inputs +test_nest = .false. +num_nest = 3 +tile_coarse = 2, 3, 7 +tile_fine = 7 , 8, 9 +istart_coarse = 3, 3, 5 +icount_coarse = 40, 5, 6 +jstart_coarse = 3, 3, 6 +jcount_coarse = 24, 6, 8 +extra_halo = 0 +ntiles_nest_all = 9 +npes_nest_tile = 2, 2, 2, 2, 2, 2, 2, 2, 1 +nest_level = 2, 1, 2 +refine_ratio = 2, 2, 2 +cyclic_nest = 'N' +! NEST inputs end +mix_2D_3D = .false. +test_get_nbr = .false. +test_edge_update = .false. +test_cubic_grid_redistribute = .false. +ensemble_size = 2 +layout_cubic = 0,0 +layout_ensemble = 0,0 +nthreads = 2 +test_boundary = .false. +layout_tripolar = 0,0 +test_group = .false. +test_global_sum = .false. +test_unstruct = .false. +test_nonsym_edge = .false. +test_halosize_performance = .false. +test_adjoint = .false. +wide_halo = .false. +/ diff --git a/test_fms/mpp/input_base.nml b/test_fms/mpp/input_base.nml index 5b77f7e02b..41516a0f3d 100755 --- a/test_fms/mpp/input_base.nml +++ b/test_fms/mpp/input_base.nml @@ -55,7 +55,6 @@ test_boundary = .false. layout_tripolar = 0,0 test_group = .false. test_global_sum = .false. -test_subset = .false. test_unstruct = .false. test_nonsym_edge = .false. test_halosize_performance = .false. 
diff --git a/test_fms/mpp/input_blank.nml b/test_fms/mpp/input_blank.nml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_fms/mpp/test_chksum_int.F90 b/test_fms/mpp/test_chksum_int.F90 new file mode 100644 index 0000000000..64a088d481 --- /dev/null +++ b/test_fms/mpp/test_chksum_int.F90 @@ -0,0 +1,84 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . 
+!*********************************************************************** +!> @author Ryan Mulhall +!> @email gfdl.climate.model.info@noaa.gov +!> @brief Test mpp_chksum with mixed precision integers +!> @description Tests mpp_chksum with 8 and 4 byte integer arrays with +!> normal and distributed checksums +program test_chksum_int + + use platform_mod + use mpp_mod, only : mpp_init, mpp_exit, mpp_pe, mpp_npes, mpp_root_pe, stdout + use mpp_mod, only : mpp_set_stack_size, mpp_sync, mpp_init_test_init_true_only + use mpp_mod, only : mpp_transmit, mpp_chksum, ALL_PES + use mpp_mod, only : mpp_error, FATAL, mpp_sync_self, NOTE + use mpp_io_mod, only: mpp_io_init + + implicit none + + integer :: pe, npes, root, out_unit, ierr + integer(i8_kind), allocatable :: data8(:), distData(:),temp(:) + integer(i8_kind) :: res4, res8, resDist + integer(i4_kind), allocatable :: data4(:) + real, allocatable :: rands(:) + integer :: i, length + + call mpp_init(mpp_init_test_init_true_only) + call mpp_io_init() + call mpp_set_stack_size(3145746) + pe = mpp_pe() + npes = mpp_npes() + root = mpp_root_pe() + out_unit = stdout() + + !> generate random arrays + length = 1024 + allocate(rands(length), data8(length), data4(length), distData(length)) + call random_number(rands) + do i = 1, length + data8(i) = rands(i) * huge(data4(1)) + data4(i) = rands(i) * huge(data4(1)) + distData(i) = rands(i) * huge(distData(1)) + end do + !>test mixed precision int checksums + res4 = mpp_chksum(data4) + res8 = mpp_chksum(data8) + if(res4.NE.res8) then + call mpp_error(FATAL, 'Test mpp_chksum_int: mixed precision checksums do not match') + else + call mpp_error(NOTE, 'Test mpp_chksum_int: mixed precision checksums match') + endif + !>test distributed int checksums + call mpp_sync() + call mpp_transmit( put_data=distData(1), plen=length, to_pe=ALL_PES, & + get_data=distData(1),glen=length, from_pe=root) + call mpp_sync_self() + allocate(temp(length/npes)) + temp = distData( pe*(length/npes)+1 : 
(pe+1)*(length/npes))!> distribute data for pelist + resDist = mpp_chksum(distData(1:length), (/pe/)) + if(resDist.NE.mpp_chksum(temp)) then + call mpp_error(FATAL, 'Test mpp_chksum_int: distributed checksums do not match') + else + call mpp_error(NOTE, 'Test mpp_chksum_int: distributed checksums match') + endif + deallocate(rands, data8, data4, distData, temp) + + call MPI_FINALIZE(ierr) + +end program test_chksum_int diff --git a/test_fms/mpp/test_chksum_int.sh b/test_fms/mpp/test_chksum_int.sh new file mode 100755 index 0000000000..a343a39413 --- /dev/null +++ b/test_fms/mpp/test_chksum_int.sh @@ -0,0 +1,58 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Ed Hartnett 11/29/19 + +# Set common test settings. +. 
../test_common.sh + +skip_test="no" + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi + +# Do we need to oversubscribe +if [ ${nProc} -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. + skip_test="skip" +elif [ $nProc -lt 4 ] +then + # Need to oversubscribe the MPI + run_test test_chksum_int 4 $skip_test "true" +fi + +touch input.nml +run_test test_chksum_int 4 $skip_test + diff --git a/test_fms/mpp/test_clock_init b/test_fms/mpp/test_clock_init new file mode 100755 index 0000000000..094e06db7e --- /dev/null +++ b/test_fms/mpp/test_clock_init @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_clock_init - temporary wrapper script for .libs/test_clock_init +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_clock_init program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. 
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_clock_init:test_clock_init:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_clock_init:test_clock_init:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_clock_init:test_clock_init:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' 
+ if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_clock_init' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_domains_simple b/test_fms/mpp/test_domains_simple new file mode 100755 index 0000000000..75be87a042 --- /dev/null +++ b/test_fms/mpp/test_domains_simple @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_domains_simple - temporary wrapper script for .libs/test_domains_simple +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_domains_simple program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. 
+sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_domains_simple:test_domains_simple:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_domains_simple:test_domains_simple:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_domains_simple:test_domains_simple:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. 
+ + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_domains_simple' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 
1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_domains_utility_mod.F90 b/test_fms/mpp/test_domains_utility_mod.F90 new file mode 100644 index 0000000000..b0d261d8a2 --- /dev/null +++ b/test_fms/mpp/test_domains_utility_mod.F90 @@ -0,0 +1,276 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @author Miguel Zuniga +!> @brief A module with utility auxiliary interface supporting test_mpp_domains. +!> @note Note: the source code of this module is largely originally in test_mpp_domains.F90. 
+module test_domains_utility_mod + use mpp_mod, only : FATAL, WARNING, MPP_DEBUG, NOTE + use mpp_mod, only : mpp_error + use mpp_domains_mod, only : ZERO, NINETY, MINUS_NINETY + use platform_mod, only: r4_kind, r8_kind + + interface fill_coarse_data + module procedure fill_coarse_data_r8 + module procedure fill_coarse_data_r4 + end interface fill_coarse_data + + interface fill_nest_data + module procedure fill_nest_data_r8 + module procedure fill_nest_data_r4 + end interface fill_nest_data + + contains + +subroutine fill_coarse_data_r8(data, rotate, iadd, jadd, is_c, ie_c, js_c, je_c, nz, isd, jsd, nx, ny, & + ishift, jshift, x_add, y_add, sign1, sign2, x_cyclic, y_cyclic, ieg, jeg) + integer, intent(in) :: rotate, is_c, ie_c, js_c, je_c, nz, isd, jsd, iadd, jadd, nx, ny, ishift, jshift + integer, intent(in) :: sign1, sign2 + real(kind=r8_kind), intent(inout) :: data(isd:, jsd:, :) + real(kind=r8_kind), intent(in) :: x_add, y_add + logical, intent(in) :: x_cyclic, y_cyclic + integer, intent(in) :: ieg, jeg + integer :: i, j, k + + select case (rotate) + case (ZERO) + ! convert the index to be consistent with the fine grid. + do k = 1, nz + do j = js_c, je_c+jshift + do i = is_c, ie_c+ishift + data(i,j,k) = (i+iadd)*1.e+6 + (j+jadd)*1.e+3 + k + x_add + enddo + enddo + enddo + case (NINETY) + ! convert the index to be consistent with the fine grid. + do k = 1, nz + do j = js_c, je_c+jshift + do i = is_c, ie_c+ishift + data(i,j,k) = sign1*((nx-j+1+iadd+jshift)*1.e+6 + (i+jadd)*1.e+3 + k + y_add) + enddo + enddo + enddo + case (MINUS_NINETY) + ! convert the index to be consistent with the fine grid. 
+ do k = 1, nz + do j = js_c, je_c+jshift + do i = is_c, ie_c+ishift + data(i,j,k) = sign2*((j+iadd)*1.e+6 + (ny-i+1+jadd+ishift)*1.e+3 + k + y_add) + enddo + enddo + enddo + case default + call mpp_error(FATAL,"fill_coarse_data: rotate_coarse must be ZERO, NINETY, MINUS_NINETY") + end select + + !---handle cyclic condition + if(x_cyclic) then + if(ie_c+ishift+iadd == ieg) then + i = ie_c+ishift + do k = 1, nz + do j = js_c, je_c+jshift + data(i,j,k) = i*1.e+6 + (j+jadd)*1.e+3 + k + x_add + enddo + enddo + endif + endif + + + if(y_cyclic) then + if(je_c+jshift+jadd == jeg) then + j = je_c+jshift + do k = 1, nz + do j = js_c, je_c+jshift + data(i,j,k) = (i+iadd)*1.e+6 + j*1.e+3 + k + x_add + enddo + enddo + endif + endif + + end subroutine fill_coarse_data_r8 + + +subroutine fill_coarse_data_r4(data, rotate, iadd, jadd, is_c, ie_c, js_c, je_c, nz, isd, jsd, nx, ny, & + ishift, jshift, x_add, y_add, sign1, sign2, x_cyclic, y_cyclic, ieg, jeg) + integer, intent(in) :: rotate, is_c, ie_c, js_c, je_c, nz, isd, jsd, iadd, jadd, nx, ny, ishift, jshift + integer, intent(in) :: sign1, sign2 + real(kind=r4_kind), intent(inout) :: data(isd:, jsd:, :) + real(kind=r8_kind), intent(in) :: x_add, y_add + logical, intent(in) :: x_cyclic, y_cyclic + integer, intent(in) :: ieg, jeg + integer :: i, j, k + + select case (rotate) + case (ZERO) + ! convert the index to be consistent with the fine grid. + do k = 1, nz + do j = js_c, je_c+jshift + do i = is_c, ie_c+ishift + data(i,j,k) = (i+iadd)*1.e+6 + (j+jadd)*1.e+3 + k + x_add + enddo + enddo + enddo + case (NINETY) + ! convert the index to be consistent with the fine grid. + do k = 1, nz + do j = js_c, je_c+jshift + do i = is_c, ie_c+ishift + data(i,j,k) = sign1*((nx-j+1+iadd+jshift)*1.e+6 + (i+jadd)*1.e+3 + k + y_add) + enddo + enddo + enddo + case (MINUS_NINETY) + ! convert the index to be consistent with the fine grid. 
+ do k = 1, nz + do j = js_c, je_c+jshift + do i = is_c, ie_c+ishift + data(i,j,k) = sign2*((j+iadd)*1.e+6 + (ny-i+1+jadd+ishift)*1.e+3 + k + y_add) + enddo + enddo + enddo + case default + call mpp_error(FATAL,"fill_coarse_data: rotate_coarse must be ZERO, NINETY, MINUS_NINETY") + end select + + !---handle cyclic condition + if(x_cyclic) then + if(ie_c+ishift+iadd == ieg) then + i = ie_c+ishift + do k = 1, nz + do j = js_c, je_c+jshift + data(i,j,k) = i*1.e+6 + (j+jadd)*1.e+3 + k + x_add + enddo + enddo + endif + endif + + + if(y_cyclic) then + if(je_c+jshift+jadd == jeg) then + j = je_c+jshift + do k = 1, nz + do j = js_c, je_c+jshift + data(i,j,k) = (i+iadd)*1.e+6 + j*1.e+3 + k + x_add + enddo + enddo + endif + endif + + end subroutine fill_coarse_data_r4 + + !########################################################################################### + + subroutine fill_nest_data_r8(buffer, is, ie, js, je, nnest, tile, ishift, jshift, iadd, jadd, rotate, & + isl, iel, jsl, jel, xadd, yadd, sign1, sign2, nx, ny) + real(kind=r8_kind), dimension(is:,js:,:), intent(inout) :: buffer + integer, intent(in) :: is, ie, js, je, nnest + integer, intent(in) :: ishift, jshift + integer, dimension(:), intent(in) :: tile, iadd, jadd, rotate, isl, iel, jsl, jel + real(kind=r8_kind), intent(in) :: xadd, yadd + integer, intent(in) :: sign1, sign2 + integer, intent(in) :: nx, ny + integer :: i, j, k, n, nk + integer :: ioff, joff + + ioff = 0 + joff = 0 + nk = size(buffer,3) + do k = 1, nk + do n = 1, nnest + if(iel(n) == ie) ioff = ishift + if(jel(n) == je) joff = jshift + + select case (rotate(n)) + case(ZERO) + do j = jsl(n), jel(n)+joff + do i = isl(n), iel(n)+ioff + buffer(i,j,k) = xadd + tile(n) + (i-iadd(n))*1.e-3 + (j-jadd(n))*1.e-6 + k*1.e-9 + enddo + enddo + case (NINETY) + do j = jsl(n), jel(n)+joff + do i = isl(n), iel(n)+ioff + buffer(i,j,k) = sign2*(yadd + tile(n) + (j-jadd(n))*1.e-3 + (nx-i+iadd(n)+1+ioff)*1.e-6 + k*1.e-9) + enddo + enddo + case (MINUS_NINETY) + do 
j = jsl(n), jel(n)+joff + do i = isl(n), iel(n)+ioff + buffer(i,j,k) = sign1*(yadd + tile(n) + (ny-j+jadd(n)+1+joff)*1.e-3 + (i-iadd(n))*1.e-6 + k*1.e-9) + enddo + enddo + case default + call mpp_error(FATAL,"fill_nest_data: rotate must be ZERO, NINETY, MINUS_NINETY") + end select + enddo + enddo + + end subroutine fill_nest_data_r8 + + !########################################################################################### + + subroutine fill_nest_data_r4(buffer, is, ie, js, je, nnest, tile, ishift, jshift, iadd, jadd, rotate, & + isl, iel, jsl, jel, xadd, yadd, sign1, sign2, nx, ny) + real(kind=r4_kind), dimension(is:,js:,:), intent(inout) :: buffer + integer, intent(in) :: is, ie, js, je, nnest + integer, intent(in) :: ishift, jshift + integer, dimension(:), intent(in) :: tile, iadd, jadd, rotate, isl, iel, jsl, jel + real(kind=r8_kind), intent(in) :: xadd, yadd + integer, intent(in) :: sign1, sign2 + integer, intent(in) :: nx, ny + integer :: i, j, k, n, nk + integer :: ioff, joff + + ioff = 0 + joff = 0 + nk = size(buffer,3) + do k = 1, nk + do n = 1, nnest + if(iel(n) == ie) ioff = ishift + if(jel(n) == je) joff = jshift + + select case (rotate(n)) + case(ZERO) + do j = jsl(n), jel(n)+joff + do i = isl(n), iel(n)+ioff + buffer(i,j,k) = xadd + tile(n) + (i-iadd(n))*1.e-3 + (j-jadd(n))*1.e-6 + k*1.e-9 + enddo + enddo + case (NINETY) + do j = jsl(n), jel(n)+joff + do i = isl(n), iel(n)+ioff + buffer(i,j,k) = sign2*(yadd + tile(n) + (j-jadd(n))*1.e-3 + (nx-i+iadd(n)+1+ioff)*1.e-6 + k*1.e-9) + enddo + enddo + case (MINUS_NINETY) + do j = jsl(n), jel(n)+joff + do i = isl(n), iel(n)+ioff + buffer(i,j,k) = sign1*(yadd + tile(n) + (ny-j+jadd(n)+1+joff)*1.e-3 + (i-iadd(n))*1.e-6 + k*1.e-9) + enddo + enddo + case default + call mpp_error(FATAL,"fill_nest_data: rotate must be ZERO, NINETY, MINUS_NINETY") + end select + enddo + enddo + + end subroutine fill_nest_data_r4 + + +end module test_domains_utility_mod diff --git a/test_fms/mpp/test_global_arrays.F90 
b/test_fms/mpp/test_global_arrays.F90 new file mode 100644 index 0000000000..4693e8b73e --- /dev/null +++ b/test_fms/mpp/test_global_arrays.F90 @@ -0,0 +1,687 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see <http://www.gnu.org/licenses/>. +!*********************************************************************** +!> @author Ryan Mulhall +!> @email gfdl.climate.model.info@noaa.gov +!> @brief Unit tests for mpp global max, min, and sum functions +!> @description Generates a random data set for both SIZEs of reals and ints +!> then checks routines with local results received from each pe +program test_global_arrays + + use platform_mod + use mpp_mod, only: mpp_init, mpp_exit, mpp_pe, mpp_npes, mpp_root_pe + use mpp_mod, only: mpp_set_stack_size, mpp_sync, mpp_sync_self + use mpp_mod, only: mpp_error, FATAL, NOTE, mpp_send, mpp_recv, WARNING + use mpp_mod, only: mpp_init_test_init_true_only, mpp_set_root_pe + use mpp_io_mod, only: mpp_io_init + use mpp_domains_mod, only: mpp_domains_init, mpp_define_domains, domain2d + use mpp_domains_mod, only: mpp_define_layout, mpp_domains_set_stack_size + use mpp_domains_mod, only: mpp_get_global_domain, mpp_global_max + use mpp_domains_mod, only: mpp_global_min, mpp_get_data_domain,mpp_get_compute_domain + use
mpp_domains_mod, only: mpp_domains_exit, mpp_update_domains + use mpp_domains_mod, only: mpp_get_domain_shift, mpp_global_sum + + implicit none + + integer, parameter :: length=64 + integer :: id, pe, npes, root, i, j, icount, jcount + integer(i4_kind) :: maxI4, minI4, ierr, sumI4, sumI4_5d + integer(i8_kind) :: maxI8, minI8, sumI8, sumI8_5d + integer(i4_kind), allocatable :: dataI4(:,:), dataI4_5d(:,:,:,:,:), dataI4_shuf(:,:) + integer(i8_kind), allocatable :: dataI8(:,:), dataI8_5d(:,:,:,:,:), dataI8_shuf(:,:) + real(r4_kind), allocatable :: dataR4(:,:), dataR4_5d(:,:,:,:,:), dataR4_shuf(:,:) + real(r8_kind), allocatable :: dataR8(:,:), dataR8_5d(:,:,:,:,:), dataR8_shuf(:,:) + real, allocatable :: rands(:) + type(domain2D) :: domain + real(r8_kind) :: rcoef, maxR8, minR8, sumR8, sumR8_5d + real(r4_kind) :: maxR4, minR4, sumR4, sumR4_5d + integer :: isc, iec, jsc, jec + integer :: isd, ied, jsd, jed + character(len=32) :: strTmp1, strTmp2 + integer(i4_kind), parameter :: randmaxI4 = 2048 + integer(i8_kind), parameter :: randmaxI8 = 4096 + + call mpp_init(mpp_init_test_init_true_only) + call mpp_io_init() + call mpp_domains_init() + call mpp_set_stack_size(3145746) + call mpp_domains_set_stack_size(3145746) + pe = mpp_pe() + npes = mpp_npes() + call mpp_set_root_pe(0) + root = mpp_root_pe() + + !> define domains and allocate + call mpp_define_domains( (/1,length,1,length/), (/4,2/), domain, xhalo=0) + call mpp_get_compute_domain(domain, jsc, jec, isc, iec) + call mpp_get_data_domain(domain, jsd, jed, isd, ied) + allocate(dataI4(jsd:jed, isd:ied),dataI8(jsd:jed, isd:ied), rands(length*length)) + allocate(dataR4(jsd:jed, isd:ied), dataR8(jsd:jed, isd:ied)) + allocate(dataR4_shuf(jsd:jed, isd:ied), dataR8_shuf(jsd:jed, isd:ied)) + allocate(dataI4_shuf(jsd:jed, isd:ied), dataI8_shuf(jsd:jed, isd:ied)) + + dataI4 = 0; dataI8 = 0; dataR4 = 0.0; dataR8 = 0.0 + dataR8_shuf=0.0; dataR4_shuf=0.0;dataI8_shuf=0; dataI4_shuf=0 + + !> make random arrays + call random_seed() + 
call random_number(rands) + do i=isc, iec-1 + do j=jsc, jec-1 + rcoef = rands(j + i*length) * 2 -1 + dataI4(j, i) = int(rcoef * randmaxI4, kind=i4_kind) + dataI8(j, i) = int(rcoef * randmaxI8, kind=i8_kind) + dataR4(j, i) = real(rcoef, kind=r4_kind) + dataR8(j, i) = real(rcoef, kind=r8_kind) + end do + end do + + !> test global max and mins from each kind + call mpp_error(NOTE, "----------Testing 32-bit int mpp_global_max and mpp_global_min----------") + call mpp_update_domains(dataI4, domain) + maxI4 = mpp_global_max(domain, dataI4) + minI4 = mpp_global_min(domain, dataI4) + write(strTmp1, *) maxI4 + write(strTmp2, *) minI4 + if(.NOT. checkResultInt4((/minI4, maxI4 /))) then + call mpp_error(FATAL, "test_global_arrays: invalid 32-bit integer results"// & + NEW_LINE('a')//"Max: "//strTmp1//" Min: "//strTmp2 ) + endif + call mpp_sync() + call mpp_error(NOTE, "----------Testing 64-bit int mpp_global_max and mpp_global_min----------") + call mpp_update_domains(dataI8, domain) + maxI8 = mpp_global_max(domain, dataI8) + minI8 = mpp_global_min(domain, dataI8) + write(strTmp1, *) maxI8 + write(strTmp2, *) minI8 + if(.NOT. checkResultInt8((/minI8, maxI8 /))) then + call mpp_error(FATAL, "test_global_arrays: invalid 64-bit integer results"// & + NEW_LINE('a')//"Max: "//strTmp1//" Min: "//strTmp2 ) + endif + call mpp_sync() + call mpp_error(NOTE, "----------Testing 32-bit real mpp_global_max and mpp_global_min----------") + call mpp_update_domains(dataR4, domain) + maxR4 = mpp_global_max(domain, dataR4) + minR4 = mpp_global_min(domain, dataR4) + write(strTmp1, *) maxR4 + write(strTmp2, *) minR4 + if(.NOT. 
checkResultReal4((/minR4, maxR4 /))) then + call mpp_error(FATAL, "test_global_arrays: invalid 32-bit real results"// & + NEW_LINE('a')//"Max: "//strTmp1//" Min: "//strTmp2 ) + endif + call mpp_sync() + call mpp_error(NOTE, "----------Testing 64-bit real mpp_global_max and mpp_global_min----------") + call mpp_update_domains(dataR8, domain) + maxR8 = mpp_global_max(domain, dataR8) + minR8 = mpp_global_min(domain, dataR8) + write(strTmp1, *) maxR8 + write(strTmp2, *) minR8 + if(.NOT. checkResultReal8((/minR8, maxR8 /))) then + call mpp_error(FATAL, "test_global_arrays: invalid 64-bit real results"// & + NEW_LINE('a')//"Max: "//strTmp1//" Min: "//strTmp2 ) + endif + + !> test global sums for each kind + call mpp_error(NOTE, "----------Testing 32-bit real mpp_global_sum----------") + call mpp_update_domains(dataR4, domain) + sumR4 = mpp_global_sum(domain, dataR4) + write(strTmp1,*) sumR4 + if(.NOT. checkSumReal4(sumR4)) then + call mpp_error(FATAL, "test_global_arrays: invalid 32-bit real sum"// & + NEW_LINE('a')//"Sum: "// strTmp1 ) + endif + call mpp_error(NOTE, "----------Testing 64-bit real mpp_global_sum----------") + call mpp_update_domains(dataR8, domain) + sumR8 = mpp_global_sum(domain, dataR8) + write(strTmp1,*) sumR8 + if(.NOT. checkSumReal8(sumR8)) then + call mpp_error(FATAL, "test_global_arrays: invalid 64-bit real sum"// & + NEW_LINE('a')//"Sum: "// strTmp1 ) + endif + call mpp_error(NOTE, "----------Testing 32-bit integer mpp_global_sum----------") + call mpp_update_domains(dataI4, domain) + sumI4 = mpp_global_sum(domain, dataI4) + write(strTmp1,*) sumI4 + if(.NOT. checkSumInt4(sumI4)) then + call mpp_error(FATAL, "test_global_arrays: invalid 32-bit integer sum"// & + NEW_LINE('a')//"Sum: "// strTmp1 ) + endif + call mpp_error(NOTE, "----------Testing 64-bit integer mpp_global_sum----------") + call mpp_update_domains(dataI8, domain) + sumI8 = mpp_global_sum(domain, dataI8) + write(strTmp1,*) sumI8 + if(.NOT. 
checkSumInt8(sumI8)) then + call mpp_error(FATAL, "test_global_arrays: invalid 64-bit integer sum"// & + NEW_LINE('a')//"Sum: "// strTmp1 ) + endif + + !> shuffle real data ordering and copy into array with 5 ranks + dataR4_shuf = dataR4 + dataR8_shuf = dataR8 + call shuffleDataR4(dataR4_shuf) + call shuffleDataR8(dataR8_shuf) + allocate(dataR4_5d(jsd:jed, isd:ied, 1, 1, 1), dataR8_5d(jsd:jed,isd:ied, 1, 1, 1)) + + dataR4_5d = 0.0 + dataR8_5d = 0.0 + + do i=isc,iec + do j=jsc,jec + dataR4_5d(j, i, 1, 1, 1) = dataR4_shuf(j, i) + dataR8_5d(j, i, 1, 1, 1) = dataR8_shuf(j, i) + end do + end do + call mpp_sync() + + call mpp_error(NOTE, "----------Testing 32-bit real mpp_global_sum with 5 ranks and reordering----------") + call mpp_update_domains(dataR4_5d, domain) + sumR4_5d = mpp_global_sum(domain, dataR4_5d) + + ! check that shuffled array results are approximately the same as the original array + if(abs((sumR4-sumR4_5d)/sumR4) .gt. 1e-5) then + strTmp1 = ""; strTmp2="" + write(strTmp1,*) sumR4_5d + write(strTmp2,*) sumR4 + call mpp_error(FATAL,"test_global_arrays: invalid 32-bit real answer after reordering"// & + NEW_LINE('a')//"Sum: "// strTmp1// " ne "//strTmp2) + endif + + call mpp_error(NOTE, "----------Testing 64-bit real mpp_global_sum with 5 ranks and reordering----------") + call mpp_update_domains(dataR8_5d, domain) + sumR8_5d = mpp_global_sum(domain, dataR8_5d) + ! check that shuffled array results are approximately the same as the original array + !> @note This test fails with gcc 9.3.0 + if(abs((sumR8-sumR8_5d)/sumR8) .gt. 
1e-7) then + strTmp1 = ""; strTmp2="" + write(strTmp1,*) sumR8_5d + write(strTmp2,*) sumR8 + call mpp_error(FATAL,"test_global_arrays: invalid 64-bit real answer after reordering"// & + NEW_LINE('a')//"Sum: "// strTmp1// " ne "//strTmp2) + endif + + !> shuffle integer data ordering and copy into array with 5 ranks + dataI4_shuf = dataI4 + dataI8_shuf = dataI8 + call shuffleDataI4(dataI4_shuf) + call shuffleDataI8(dataI8_shuf) + allocate(dataI4_5d(jsd:jed, isd:ied, 1, 1, 1), dataI8_5d(jsd:jed,isd:ied, 1, 1, 1)) + + dataI4_5d = 0 + dataI8_5d = 0 + do i=isc,iec + do j=jsc,jec + dataI4_5d(j, i, 1, 1, 1) = dataI4_shuf(j, i) + dataI8_5d(j, i, 1, 1, 1) = dataI8_shuf(j, i) + end do + end do + call mpp_sync() + + call mpp_error(NOTE, "----------Testing 32-bit integer mpp_global_sum with 5 ranks and reordering----------") + call mpp_update_domains(dataI4_5d, domain) + sumI4_5d = mpp_global_sum(domain, dataI4_5d) + + ! check that shuffled array results are approximately the same as the original array + if(sumI4 .ne. sumI4_5d) then + strTmp1 = ""; strTmp2="" + write(strTmp1,*) sumI4_5d + write(strTmp2,*) sumI4 + call mpp_error(FATAL,"test_global_arrays: invalid 32-bit integer answer after reordering"// & + NEW_LINE('a')//"Sum: "// strTmp1// " ne "//strTmp2) + endif + + call mpp_error(NOTE, "----------Testing 64-bit integer mpp_global_sum with 5 ranks and reordering----------") + call mpp_update_domains(dataI8_5d, domain) + sumI8_5d = mpp_global_sum(domain, dataI8_5d) + + ! check that shuffled array results are approximately the same as the original array + !> @note This test fails with gcc 9.3.0 + if(sumI8 .ne. 
sumI8_5d) then + strTmp1 = ""; strTmp2="" + write(strTmp1,*) sumI8_5d + write(strTmp2,*) sumI8 + call mpp_error(FATAL,"test_global_arrays: invalid 64-bit integer answer after reordering"// & + NEW_LINE('a')//"Sum: "// strTmp1// " ne "//strTmp2) + endif + + deallocate(dataI4, dataI8, dataR4, dataR8, rands, dataI4_5d, dataI8_5d, dataR4_5d, dataR8_5d) + deallocate(dataR4_shuf, dataR8_shuf,dataI4_shuf, dataI8_shuf) + call mpp_domains_exit() + call MPI_FINALIZE(ierr) + + contains + +!> true if all pes return the same result and have a lower/higher local max/min +function checkResultInt4(res) + logical :: checkResultInt4 + integer(i4_kind),intent(in) :: res(2) + integer(i4_kind),allocatable :: tres(:) + + allocate(tres(2)) + checkResultInt4 = res(2).GE.maxval(dataI4) .and. res(1).LE.minval(dataI4) + if(.NOT.checkResultInt4) then + return + end if + !> check that all pes have same results + if( pe.EQ.root) then + tres = res + do i=1, npes-1 + call mpp_send(tres,2, i) + end do + checkResultInt4 = .true. + else + call mpp_recv(tres,2, root) + checkResultInt4 = checkResultInt4 .and. res(1) .EQ. tres(1) .and. res(2) .eq. tres(2) + end if + deallocate(tres) +end function checkResultInt4 + +function checkResultInt8(res) + logical :: checkResultInt8 + integer(i8_kind),intent(in) :: res(2) + integer(i8_kind),allocatable :: tres(:) + + allocate(tres(2)) + checkResultInt8 = res(2).GE.maxval(dataI8) .and. res(1).LE.minval(dataI8) + if(.NOT.checkResultInt8) then + return + end if + !> check that all pes have same results + if( pe.EQ.root) then + tres = res + do i=1, npes-1 + call mpp_send(tres,2, i) + end do + checkResultInt8 = .true. + else + call mpp_recv(tres,2, root) + checkResultInt8 = checkResultInt8 .and. res(1) .EQ. tres(1) .and. res(2) .eq. 
tres(2) + end if + deallocate(tres) +end function checkResultInt8 + +function checkResultReal4(res) + logical :: checkResultReal4 + real(r4_kind),intent(in) :: res(2) + real(r4_kind),allocatable :: tres(:) + + allocate(tres(2)) + checkResultReal4 = res(2).GE.maxval(dataR4) .and. res(1).LE.minval(dataR4) + if(.NOT. checkResultReal4) then + return + end if + !> check that all pes have same results + if( pe.EQ.root) then + tres = res + do i=1, npes-1 + call mpp_send(tres,2, i) + end do + checkResultReal4 = .true. + else + call mpp_recv(tres,2, root) + checkResultReal4 = checkResultReal4 .and. (abs((res(1)-tres(1))/res(1)) .lt. 1e-5) .and. & + (abs((res(2)-tres(2))/res(2)) .lt. 1e-5) + end if + deallocate(tres) +end function checkResultReal4 + +function checkResultReal8(res) + logical :: checkResultReal8 + real(r8_kind),intent(in) :: res(:) + real(r8_kind),allocatable :: tres(:) + + allocate(tres(2)) + checkResultReal8 = res(2).GE.maxval(dataR8) .and. res(1).LE.minval(dataR8) + if(.NOT.checkResultReal8) then + return + end if + !> check that all pes have same results + if( pe.EQ.root) then + tres = res + do i=1, npes-1 + call mpp_send(tres,2, i) + end do + checkResultReal8 = .true. + else + call mpp_recv(tres,2, root) + checkResultReal8 = checkResultReal8 .and. (abs((res(1)-tres(1))/res(1)) .lt. 1e-7) .and. & + (abs((res(2)-tres(2))/res(2)) .lt. 1e-7) + end if + deallocate(tres) +end function checkResultReal8 + +!>@brief Sum local sums from pes and compares with gsum +!>@return True if gsum is the global sum, false otherwise +function checkSumReal4(gsum) + logical :: checkSumReal4 + real(r4_kind),intent(in) :: gsum + real(r4_kind),allocatable :: recv(:) !> pe's local sum at 1, global sum at 2 + real(r4_kind) :: nsum + integer :: i + + allocate(recv(2)) + ! root receives and sums local sums from each pe + if(pe .eq. root) then + nsum = SUM(dataR4) + do i=1, npes - 1 + call mpp_recv(recv, 2, i) + nsum = nsum + recv(1) + ! 
also check for matching global sum + if( abs((recv(2)-gsum)/gsum) .gt. 1e-5) then + checkSumReal4 = .false. + deallocate(recv) + return + endif + end do + checkSumReal4 = (abs((nsum-gsum)/gsum) .lt. 1e-5) + else + recv(1) = SUM(dataR4) + recv(2) = gsum + call mpp_send(recv, 2, root) + checkSumReal4 = .true. + endif + deallocate(recv) +end function checkSumReal4 + +!>@brief Sum local sums from pes and compares with gsum +!>@return True if gsum is the global sum, false otherwise +function checkSumReal8(gsum) + logical :: checkSumReal8 + real(r8_kind),intent(in) :: gsum + real(r8_kind),allocatable :: recv(:) !> pe's local sum at 1, global sum at 2 + real(r8_kind) :: nsum + integer :: i + + allocate(recv(2)) + ! root receives and sums local sums from each pe + if(pe .eq. root) then + nsum = SUM(dataR8) + do i=1, npes - 1 + call mpp_recv(recv, 2, i) + nsum = nsum + recv(1) + ! also check for matching global sum + if( abs((recv(2)-gsum)/gsum) .gt. 1e-7 ) then + checkSumReal8 = .false. + deallocate(recv) + return + endif + end do + checkSumReal8 = (abs((nsum-gsum)/gsum) .lt. 1e-7) + else + recv(1) = SUM(dataR8) + recv(2) = gsum + call mpp_send(recv, 2, root) + checkSumReal8 = .true. + endif + deallocate(recv) +end function checkSumReal8 + +!>@brief Sum local sums from pes and compares with gsum +!>@return True if gsum is the global sum, false otherwise +function checkSumInt4(gsum) + logical :: checkSumInt4 + integer(i4_kind),intent(in) :: gsum + integer(i4_kind),allocatable :: recv(:) !> pe's local sum at 1, global sum at 2 + integer(i4_kind) :: nsum + integer :: i + + allocate(recv(2)) + ! root receives and sums local sums from each pe + if(pe .eq. root) then + nsum = SUM(dataI4) + do i=1, npes - 1 + call mpp_recv(recv, 2, i) + nsum = nsum + recv(1) + ! also check for matching global sum + if( recv(2) .ne. gsum ) then + checkSumInt4 = .false. + deallocate(recv) + return + endif + end do + checkSumInt4 = nsum .eq. 
gsum + else + recv(1) = SUM(dataI4) + recv(2) = gsum + call mpp_send(recv, 2, root) + checkSumInt4 = .true. + endif + deallocate(recv) +end function checkSumInt4 + +!>@brief Sum local sums from pes and compares with gsum +!>@return True if gsum is the global sum, false otherwise +function checkSumInt8(gsum) + logical :: checkSumInt8 + integer(i8_kind),intent(in) :: gsum + integer(i8_kind),allocatable :: recv(:) !> pe's local sum at 1, global sum at 2 + integer(i8_kind) :: nsum + integer :: i + + allocate(recv(2)) + ! root receives and sums local sums from each pe + if(pe .eq. root) then + nsum = SUM(dataI8) + do i=1, npes - 1 + call mpp_recv(recv, 2, i) + nsum = nsum + recv(1) + ! also check for matching global sum + if( recv(2) .ne. gsum ) then + checkSumInt8 = .false. + deallocate(recv) + return + endif + end do + checkSumInt8 = nsum .eq. gsum + else + recv(1) = SUM(dataI8) + recv(2) = gsum + call mpp_send(recv, 2, root) + checkSumInt8 = .true. + endif + deallocate(recv) +end function checkSumInt8 + +!> aggregates data on root and randomizes ordering, then sends partitions back to pes +subroutine shuffleDataI4(dataI4) + integer(i4_kind), intent(INOUT) :: dataI4(:,:) + integer(i4_kind), allocatable :: trans(:,:), shuffled(:),tmp + integer :: rind + + allocate(trans(SIZE(dataI4,1), SIZE(dataI4,2))) + allocate(shuffled(1:length*length)) + + if( pe.eq.root) then + !> get array partitions and aggregate into 1d + shuffled(1:SIZE(dataI4)) = RESHAPE(dataI4, (/SIZE(dataI4)/)) + do i=1, npes-1 + call mpp_recv(trans, SIZE(dataI4) , i) + shuffled( SIZE(trans)*i+1 : SIZE(trans)*(i+1)) = RESHAPE(trans, (/SIZE(trans)/)) + end do + + !> shuffle order + do i=1, length*length + rind = (rands(i) * length * length) + if( rind .eq. 
0) then + rind = 1 + endif + tmp = shuffled(i) + shuffled(i) = shuffled(rind) + shuffled(rind) = tmp + end do + trans = 0 + + !> send back to pes + do i=0, npes-1 + trans = RESHAPE(shuffled(SIZE(trans)*i + 1:SIZE(trans)*(i+1)), & + (/SIZE(trans,1), SIZE(trans,2) /) ) + if(i.ne.root) then + call mpp_send(trans, SIZE(trans), i) + else + dataI4 = trans + endif + end do + else + call mpp_send(dataI4, SIZE(dataI4), root) + call mpp_recv(trans, SIZE(dataI4), root) + dataI4 = trans + endif + deallocate(trans, shuffled) +end subroutine shuffleDataI4 + +!> aggregates data on root and randomizes ordering, then sends partitions back to pes +subroutine shuffleDataI8(dataI8) + integer(i8_kind), intent(INOUT) :: dataI8(:,:) + integer(i8_kind), allocatable :: trans(:,:), shuffled(:), tmp + integer :: rind + + allocate(trans(SIZE(dataI8,1), SIZE(dataI8,2))) + allocate(shuffled(1:length*length)) + + if( pe.eq.root) then + !> get array partitions and aggregate into 1d + shuffled(1:SIZE(dataI8)) = RESHAPE(dataI8, (/SIZE(dataI8)/)) + do i=1, npes-1 + call mpp_recv(trans, SIZE(dataI8) , i) + shuffled( SIZE(trans)*i+1 : SIZE(trans)*(i+1)) = RESHAPE(trans, (/SIZE(trans)/)) + end do + + !> shuffle order + do i=1, length*length + rind = (rands(i) * length * length) + if( rind .eq. 
0) then + rind = 1 + endif + tmp = shuffled(i) + shuffled(i) = shuffled(rind) + shuffled(rind) = tmp + end do + trans = 0 + + !> send back to pes + do i=0, npes-1 + trans = RESHAPE(shuffled(SIZE(trans)*i + 1:SIZE(trans)*(i+1)), & + (/SIZE(trans,1), SIZE(trans,2) /) ) + if(i.ne.root) then + call mpp_send(trans, SIZE(trans), i) + else + dataI8 = trans + endif + end do + else + call mpp_send(dataI8, SIZE(dataI8), root) + call mpp_recv(trans, SIZE(dataI8), root) + dataI8 = trans + endif + deallocate(trans, shuffled) +end subroutine shuffleDataI8 + +!> aggregates 32-bit real data on root and randomizes ordering, then sends partitions back to pes +subroutine shuffleDataR4(dataR4) + real(r4_kind), intent(INOUT) :: dataR4(:,:) + real(r4_kind), allocatable :: trans(:,:), shuffled(:), tmp + integer :: rind + + allocate(trans(SIZE(dataR4,1), SIZE(dataR4,2))) + allocate(shuffled(1:length*length)) + + if( pe.eq.root) then + !> get array partitions and aggregate into 1d + shuffled(1:SIZE(dataR4)) = RESHAPE(dataR4, (/SIZE(dataR4)/)) + do i=1, npes-1 + call mpp_recv(trans, SIZE(dataR4) , i) + shuffled( SIZE(trans)*i+1 : SIZE(trans)*(i+1)) = RESHAPE(trans, (/SIZE(trans)/)) + end do + + !> shuffle order + do i=1, length*length + rind = (rands(i) * length * length) + if( rind .eq. 
0) then + rind = 1 + endif + tmp = shuffled(i) + shuffled(i) = shuffled(rind) + shuffled(rind) = tmp + end do + trans = 0 + + !> send back to pes + do i=0, npes-1 + trans = RESHAPE(shuffled(SIZE(trans)*i + 1:SIZE(trans)*(i+1)), & + (/SIZE(trans,1), SIZE(trans,2) /) ) + if(i.ne.root) then + call mpp_send(trans, SIZE(trans), i) + else + dataR4 = trans + endif + end do + else + call mpp_send(dataR4, SIZE(dataR4), root) + call mpp_recv(trans, SIZE(dataR4), root) + dataR4 = trans + endif + deallocate(trans, shuffled) +end subroutine shuffleDataR4 + +!> aggregates 64-bit real data on root and randomizes ordering, then sends partitions back to pes +subroutine shuffleDataR8(dataR8) + real(r8_kind), intent(INOUT) :: dataR8(:,:) + real(r8_kind), allocatable :: trans(:,:), shuffled(:), tmp + integer :: rind + + allocate(trans(SIZE(dataR8,1), SIZE(dataR8,2))) + allocate(shuffled(1:length*length)) + + if( pe.eq.root) then + !> get array partitions and aggregate into 1d + shuffled(1:SIZE(dataR8)) = RESHAPE(dataR8, (/SIZE(dataR8)/)) + do i=1, npes-1 + call mpp_recv(trans, SIZE(dataR8) , i) + shuffled( SIZE(trans)*i+1 : SIZE(trans)*(i+1)) = RESHAPE(trans, (/SIZE(trans)/)) + end do + + !> shuffle order + do i=1, length*length + rind = (rands(i) * length * length) + if( rind .eq. 
0) then + rind = 1 + endif + tmp = shuffled(i) + shuffled(i) = shuffled(rind) + shuffled(rind) = tmp + end do + trans = 0 + + !> send back to pes + do i=0, npes-1 + trans = RESHAPE(shuffled(SIZE(trans)*i + 1:SIZE(trans)*(i+1)), & + (/SIZE(trans,1), SIZE(trans,2) /) ) + if(i.ne.root) then + call mpp_send(trans, SIZE(trans), i) + else + dataR8 = trans + endif + end do + else + call mpp_send(dataR8, SIZE(dataR8), root) + call mpp_recv(trans, SIZE(dataR8), root) + dataR8 = trans + endif + deallocate(trans, shuffled) +end subroutine shuffleDataR8 + +end program test_global_arrays diff --git a/test_fms/mpp/test_global_arrays.sh b/test_fms/mpp/test_global_arrays.sh new file mode 100755 index 0000000000..07a754e663 --- /dev/null +++ b/test_fms/mpp/test_global_arrays.sh @@ -0,0 +1,56 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Ryan Mulhall 2020 + +# Set common test settings. +. 
../test_common.sh + +skip_test="no" + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi + +# Do we need to oversubscribe +if [ ${nProc} -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. + skip_test="skip" +elif [ $nProc -lt 4 ] +then + # Need to oversubscribe the MPI + run_test test_global_arrays 8 $skip_test "true" +else + run_test test_global_arrays 8 $skip_test +fi diff --git a/test_fms/mpp/test_minmax.F90 b/test_fms/mpp/test_minmax.F90 new file mode 100644 index 0000000000..e0fecd15bc --- /dev/null +++ b/test_fms/mpp/test_minmax.F90 @@ -0,0 +1,349 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see <http://www.gnu.org/licenses/>. +!*********************************************************************** +!> @author Eric Stofferahn +!> @brief Test mpp_min and mpp_max functions for various precisions of +!!
reals +program test + + use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe, stdout + use mpp_mod, only : mpp_clock_id, mpp_clock_begin, mpp_clock_end, mpp_sync + use mpp_mod, only : mpp_declare_pelist, mpp_set_current_pelist, mpp_set_stack_size + use mpp_mod, only : mpp_broadcast, mpp_sum, mpp_min, mpp_max + use mpp_mod, only : mpp_error, FATAL + use platform_mod + + implicit none + + integer, parameter :: n=1048576 + real(kind=r4_kind), allocatable, dimension(:) :: a4 + real(kind=r8_kind), allocatable, dimension(:) :: a8 + integer(kind=i4_kind), allocatable, dimension(:) :: b4 + integer(kind=i8_kind), allocatable, dimension(:) :: b8 + integer :: id, pe, npes, root, i, out_unit, ierr + + call mpp_init(0) + call mpp_set_stack_size(3145746) + pe = mpp_pe() + npes = mpp_npes() + root = mpp_root_pe() + out_unit = stdout() + allocate( a4(n), a8(n), b4(n), b8(n) ) + + if( pe.EQ.root ) print *, '-> Calling test_mpp_max_r4 <-------------------' + call test_mpp_max_r4() + if( pe.EQ.root ) print *, '-> test_mpp_max_r4: <------------------ Passed!' + + if( npes.GE.2 ) then + if( pe.EQ.root ) print *, '-> Calling test_mpp_max_with_pe_r4 <-----------' + call test_mpp_max_with_pe_r4() + if( pe.EQ.root ) print *, '-> test_mpp_max_with_pe_r4: <---------- Passed!' + else + if( pe.EQ.root ) print *, '-> test_mpp_max_with_pe_r4: <- (one pe) Skipped' + end if + + if( pe.EQ.root ) print *, '-> Calling test_mpp_max_r8 <-------------------' + call test_mpp_max_r8() + if( pe.EQ.root ) print *, '-> test_mpp_max_r8: <------------------ Passed!' + + if( npes.GE.2 ) then + if( pe.EQ.root ) print *, '-> Calling test_mpp_max_with_pe_r8 <-----------' + call test_mpp_max_with_pe_r8() + if( pe.EQ.root ) print *, '-> test_mpp_max_with_pe_r8: <---------- Passed!' 
+ else + if( pe.EQ.root ) print *, '-> test_mpp_max_with_pe_r8: <- (one pe) Skipped' + end if + + if( pe.EQ.root ) print *, '-> Calling test_mpp_min_r4 <-------------------' + call test_mpp_min_r4() + if( pe.EQ.root ) print *, '-> test_mpp_min_r4: <------------------ Passed!' + + if( npes.GE.2 ) then + if( pe.EQ.root ) print *, '-> Calling test_mpp_min_with_pe_r4 <-----------' + call test_mpp_min_with_pe_r4() + if( pe.EQ.root ) print *, '-> test_mpp_min_with_pe_r4: <---------- Passed!' + else + if( pe.EQ.root ) print *, '-> test_mpp_min_with_pe_r4: <- (one pe) Skipped' + end if + + if( pe.EQ.root ) print *, '-> Calling test_mpp_min_r8 <-------------------' + call test_mpp_min_r8() + if( pe.EQ.root ) print *, '-> test_mpp_min_r8: <------------------ Passed!' + + if( npes.GE.2 ) then + if( pe.EQ.root ) print *, '-> Calling test_mpp_min_with_pe_r8 <-----------' + call test_mpp_min_with_pe_r8() + if( pe.EQ.root ) print *, '-> test_mpp_min_with_pe_r8: <---------- Passed!' + else + if( pe.EQ.root ) print *, '-> test_mpp_min_with_pe_r8: <- (one pe) Skipped' + end if + + if( pe.EQ.root ) print *, '-> Calling test_mpp_max_i4 <-------------------' + call test_mpp_max_i4() + if( pe.EQ.root ) print *, '-> test_mpp_max_i4: <------------------ Passed!' + + if( npes.GE.2 ) then + if( pe.EQ.root ) print *, '-> Calling test_mpp_max_with_pe_i4 <-----------' + call test_mpp_max_with_pe_i4() + if( pe.EQ.root ) print *, '-> test_mpp_max_with_pe_i4: <---------- Passed!' + else + if( pe.EQ.root ) print *, '-> test_mpp_max_with_pe_i4: <- (one pe) Skipped' + end if + + if( pe.EQ.root ) print *, '-> Calling test_mpp_max_i8 <-------------------' + call test_mpp_max_i8() + if( pe.EQ.root ) print *, '-> test_mpp_max_i8: <------------------ Passed!' + + if( npes.GE.2 ) then + if( pe.EQ.root ) print *, '-> Calling test_mpp_max_with_pe_i8 <-----------' + call test_mpp_max_with_pe_i8() + if( pe.EQ.root ) print *, '-> test_mpp_max_with_pe_i8: <---------- Passed!' 
+ else + if( pe.EQ.root ) print *, '-> test_mpp_max_with_pe_i8: <- (one pe) Skipped' + end if + + if( pe.EQ.root ) print *, '-> Calling test_mpp_min_i4 <-------------------' + call test_mpp_min_i4() + if( pe.EQ.root ) print *, '-> test_mpp_min_i4: <------------------ Passed!' + + if( npes.GE.2 ) then + if( pe.EQ.root ) print *, '-> Calling test_mpp_min_with_pe_i4 <-----------' + call test_mpp_min_with_pe_i4() + if( pe.EQ.root ) print *, '-> test_mpp_min_with_pe_i4: <---------- Passed!' + else + if( pe.EQ.root ) print *, '-> test_mpp_min_with_pe_i4: <- (one pe) Skipped' + end if + + if( pe.EQ.root ) print *, '-> Calling test_mpp_min_i8 <-------------------' + call test_mpp_min_i8() + if( pe.EQ.root ) print *, '-> test_mpp_min_i8: <------------------ Passed!' + + if( npes.GE.2 ) then + if( pe.EQ.root ) print *, '-> Calling test_mpp_min_with_pe_i8 <-----------' + call test_mpp_min_with_pe_i8() + if( pe.EQ.root ) print *, '-> test_mpp_min_with_pe_i8: <---------- Passed!' + else + if( pe.EQ.root ) print *, '-> test_mpp_min_with_pe_i8: <- (one pe) Skipped' + end if + + deallocate( a4, a8, b4, b8 ) + call MPI_FINALIZE(ierr) + +contains + + subroutine test_mpp_max_r4 + a4 = real(pe+1, kind=r4_kind) + call mpp_max( a4(1) ) + if (a4(1).NE.real(npes, kind=r4_kind)) then + call mpp_error(FATAL, "The r4 mpp_max function for all npes did not return the appropriate answer") + end if + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_max_r4 + + subroutine test_mpp_max_with_pe_r4 + call mpp_declare_pelist( (/(i,i=0,npes-2)/) ) + if(pe.NE.npes-1) call mpp_set_current_pelist( (/(i,i=0,npes-2)/) ) + a4 = real(pe+1, kind=r4_kind) + if( pe.NE.npes-1 ) then + call mpp_max( a4(1), (/(i,i=0,npes-2)/) ) + if (a4(1).NE.real(npes-1, kind=r4_kind)) then + call mpp_error(FATAL, "The r4 mpp_max function for all but the last pe did not return the appropriate answer") + end if + end if + call mpp_set_current_pelist() + call mpp_sync() + call flush(out_unit) + end subroutine 
test_mpp_max_with_pe_r4 + + subroutine test_mpp_max_r8 + a8 = real(pe+1, kind=r8_kind) + call mpp_max( a8(1) ) + if (a8(1).NE.real(npes, kind=r8_kind)) then + call mpp_error(FATAL, "The r8 mpp_max function for all npes did not return the appropriate answer") + end if + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_max_r8 + + subroutine test_mpp_max_with_pe_r8 + call mpp_declare_pelist( (/(i,i=0,npes-2)/) ) + if(pe.NE.npes-1) call mpp_set_current_pelist( (/(i,i=0,npes-2)/) ) + a8 = real(pe+1, kind=r8_kind) + if( pe.NE.npes-1 ) then + call mpp_max( a8(1), (/(i,i=0,npes-2)/) ) + if (a8(1).NE.real(npes-1, kind=r8_kind)) then + call mpp_error(FATAL, "The r8 mpp_max function for all but the last pe did not return the appropriate answer") + end if + end if + call mpp_set_current_pelist() + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_max_with_pe_r8 + + subroutine test_mpp_min_r4 + a4 = real(pe+1, kind=r4_kind) + call mpp_min( a4(1) ) + if (a4(1).NE.real(1, kind=r4_kind)) then + call mpp_error(FATAL, "The r4 mpp_min function for all npes did not return the appropriate answer") + end if + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_min_r4 + + subroutine test_mpp_min_with_pe_r4 + call mpp_declare_pelist( (/(i,i=0,npes-2)/) ) + if(pe.NE.npes-1) call mpp_set_current_pelist( (/(i,i=0,npes-2)/) ) + a4 = real(pe+1, kind=r4_kind) + if( pe.NE.npes-1 ) then + call mpp_min( a4(1), (/(i,i=0,npes-2)/) ) + if (a4(1).NE.real(1, kind=r4_kind)) then + call mpp_error(FATAL, "The r4 mpp_min function for all but the last pe did not return the appropriate answer") + end if + end if + call mpp_set_current_pelist() + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_min_with_pe_r4 + + subroutine test_mpp_min_r8 + a8 = real(pe+1, kind=r8_kind) + call mpp_min( a8(1) ) + if (a8(1).NE.real(1, kind=r8_kind)) then + call mpp_error(FATAL, "The r8 mpp_min function for all npes did not return the appropriate answer") + end if + 
call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_min_r8 + + subroutine test_mpp_min_with_pe_r8 + call mpp_declare_pelist( (/(i,i=0,npes-2)/) ) + if(pe.NE.npes-1) call mpp_set_current_pelist( (/(i,i=0,npes-2)/) ) + a8 = real(pe+1, kind=r8_kind) + if( pe.NE.npes-1 ) then + call mpp_min( a8(1), (/(i,i=0,npes-2)/) ) + if (a8(1).NE.real(1, kind=r8_kind)) then + call mpp_error(FATAL, "The r8 mpp_min function for all but the last pe did not return the appropriate answer") + end if + end if + call mpp_set_current_pelist() + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_min_with_pe_r8 + + subroutine test_mpp_max_i4 + b4 = int(pe+1, kind=i4_kind) + call mpp_max( b4(1) ) + if (b4(1).NE.int(npes, kind=i4_kind)) then + call mpp_error(FATAL, "The i4 mpp_max function for all npes did not return the appropriate answer") + end if + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_max_i4 + + subroutine test_mpp_max_with_pe_i4 + call mpp_declare_pelist( (/(i,i=0,npes-2)/) ) + if(pe.NE.npes-1) call mpp_set_current_pelist( (/(i,i=0,npes-2)/) ) + b4 = int(pe+1, kind=i4_kind) + if( pe.NE.npes-1 ) then + call mpp_max( b4(1), (/(i,i=0,npes-2)/) ) + if (b4(1).NE.int(npes-1, kind=i4_kind)) then + call mpp_error(FATAL, "The i4 mpp_max function for all but the last pe did not return the appropriate answer") + end if + end if + call mpp_set_current_pelist() + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_max_with_pe_i4 + + subroutine test_mpp_max_i8 + b8 = int(pe+1, kind=i8_kind) + call mpp_max( b8(1) ) + if (b8(1).NE.int(npes, kind=i8_kind)) then + call mpp_error(FATAL, "The i8 mpp_max function for all npes did not return the appropriate answer") + end if + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_max_i8 + + subroutine test_mpp_max_with_pe_i8 + call mpp_declare_pelist( (/(i,i=0,npes-2)/) ) + if(pe.NE.npes-1) call mpp_set_current_pelist( (/(i,i=0,npes-2)/) ) + b8 = int(pe+1, kind=i8_kind) + if( 
pe.NE.npes-1 ) then + call mpp_max( b8(1), (/(i,i=0,npes-2)/) ) + if (b8(1).NE.int(npes-1, kind=i8_kind)) then + call mpp_error(FATAL, "The i8 mpp_max function for all but the last pe did not return the appropriate answer") + end if + end if + call mpp_set_current_pelist() + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_max_with_pe_i8 + + subroutine test_mpp_min_i4 + b4 = int(pe+1, kind=i4_kind) + call mpp_min( b4(1) ) + if (b4(1).NE.int(1, kind=i4_kind)) then + call mpp_error(FATAL, "The i4 mpp_min function for all npes did not return the appropriate answer") + end if + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_min_i4 + + subroutine test_mpp_min_with_pe_i4 + call mpp_declare_pelist( (/(i,i=0,npes-2)/) ) + if(pe.NE.npes-1) call mpp_set_current_pelist( (/(i,i=0,npes-2)/) ) + b4 = int(pe+1, kind=i4_kind) + if( pe.NE.npes-1 ) then + call mpp_min( b4(1), (/(i,i=0,npes-2)/) ) + if (b4(1).NE.int(1, kind=i4_kind)) then + call mpp_error(FATAL, "The i4 mpp_min function for all but the last pe did not return the appropriate answer") + end if + end if + call mpp_set_current_pelist() + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_min_with_pe_i4 + + subroutine test_mpp_min_i8 + b8 = int(pe+1, kind=i8_kind) + call mpp_min( b8(1) ) + if (b8(1).NE.int(1, kind=i8_kind)) then + call mpp_error(FATAL, "The i8 mpp_min function for all npes did not return the appropriate answer") + end if + call mpp_sync() + call flush(out_unit) + end subroutine test_mpp_min_i8 + + subroutine test_mpp_min_with_pe_i8 + call mpp_declare_pelist( (/(i,i=0,npes-2)/) ) + if(pe.NE.npes-1) call mpp_set_current_pelist( (/(i,i=0,npes-2)/) ) + b8 = int(pe+1, kind=i8_kind) + if( pe.NE.npes-1 ) then + call mpp_min( b8(1), (/(i,i=0,npes-2)/) ) + if (b8(1).NE.int(1, kind=i8_kind)) then + call mpp_error(FATAL, "The i8 mpp_min function for all but the last pe did not return the appropriate answer") + end if + end if + call mpp_set_current_pelist() + call 
mpp_sync() + call flush(out_unit) + end subroutine test_mpp_min_with_pe_i8 + +end program test diff --git a/test_fms/mpp/test_minmax.sh b/test_fms/mpp/test_minmax.sh new file mode 100755 index 0000000000..61b395c146 --- /dev/null +++ b/test_fms/mpp/test_minmax.sh @@ -0,0 +1,58 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Eric Stofferahn 09/02/20 + +# Set common test settings. +. ../test_common.sh + +skip_test="no" + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi + +# Do we need to oversubscribe +if [ ${nProc} -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. 
+ skip_test="skip" +elif [ $nProc -lt 4 ] +then + # Need to oversubscribe the MPI + run_test test_minmax 4 $skip_test "true" +fi + +touch input.nml +run_test test_minmax 4 $skip_test + diff --git a/test_fms/mpp/test_mpp b/test_fms/mpp/test_mpp new file mode 100755 index 0000000000..87f1209c9d --- /dev/null +++ b/test_fms/mpp/test_mpp @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp - temporary wrapper script for .libs/test_mpp +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. 
+func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp:test_mpp:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp:test_mpp:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp:test_mpp:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' 
+ if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 
1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp.F90 b/test_fms/mpp/test_mpp.F90 index fdb0098081..ba49d48d97 100644 --- a/test_fms/mpp/test_mpp.F90 +++ b/test_fms/mpp/test_mpp.F90 @@ -20,22 +20,22 @@ #undef SYSTEM_CLOCK #endif -program test !test various aspects of mpp_mod -#include +program test !test various aspects of mpp_mod use mpp_mod, only : mpp_init, mpp_exit, mpp_pe, mpp_npes, mpp_root_pe, stdout use mpp_mod, only : mpp_clock_id, mpp_clock_begin, mpp_clock_end, mpp_sync use mpp_mod, only : mpp_declare_pelist, mpp_set_current_pelist, mpp_set_stack_size use mpp_mod, only : mpp_broadcast, mpp_transmit, mpp_sum, mpp_max, mpp_chksum, ALL_PES use mpp_mod, only : mpp_gather, mpp_error, FATAL, mpp_sync_self use mpp_io_mod, only: mpp_io_init, mpp_flush + use platform_mod implicit none integer, parameter :: n=1048576 real, allocatable, dimension(:) :: a, b, c real, allocatable, dimension(:) :: d - integer(LONG_KIND) :: locd + integer(i8_kind) :: locd integer :: tick, tick0, ticks_per_sec, id integer :: pe, npes, root, i, j, k, l, m, n2, istat integer :: out_unit @@ -49,20 +49,7 @@ program test !test various aspects of mpp_mod root = mpp_root_pe() out_unit = stdout() - if( pe.EQ.root ) print *, '------------------> Calling test_gather <------------------' - call test_gather(npes,pe,root,out_unit) - call test_gatherV(npes,pe,root,out_unit) - call test_gather2DV(npes,pe,root,out_unit) - if( pe.EQ.root ) print *, '------------------> Finished test_gather <------------------' - call SYSTEM_CLOCK( count_rate=ticks_per_sec ) - if( pe.EQ.root ) print *, '------------------> Calling test_time_transmit <------------------' - call test_time_transmit() - if( pe.EQ.root ) print *, '------------------> Finished test_time_transmit <------------------' - - if( pe.EQ.root ) print *, '------------------> Calling test_mpp_sum <------------------' - call test_mpp_sum() - if( pe.EQ.root ) print *, '------------------> Finished test_mpp_sum <------------------' if( pe.EQ.root ) 
print *, '------------------> Calling test_mpp_max <------------------' call test_mpp_max() @@ -92,293 +79,9 @@ program test !test various aspects of mpp_mod contains - !*********************************************** - - subroutine test_gather(npes,pe,root,out_unit) - integer, intent(in) :: npes,pe,root,out_unit - - integer :: pelist(npes) - integer :: i - real :: rdata(npes) - real :: val - - if(npes < 3)then - call mpp_error(FATAL, "Test_gather: minimum of 3 ranks required. Not testing gather; too few ranks.") - endif - write(out_unit,*) - - val = pe - rdata = -1.0 - do i=1,npes - pelist(i) = i-1 - enddo - - call mpp_gather((/val/),rdata) - if(pe == root)then - do i=1,npes - if(INT(rdata(i)) /= pelist(i))then - write(6,*) "Gathered data ",INT(rdata(i)), " NE reference ",pelist(i), "at i=",i - call mpp_error(FATAL, "Test gather uniform vector with global pelist failed") - endif - enddo - endif - - call mpp_sync() - write(out_unit,*) "Test gather uniform vector with global pelist successful" - - rdata = -1.0 - if(ANY(pe == pelist(2:npes)))call mpp_gather((/val/),rdata(2:npes),pelist(2:npes)) - if(pe == pelist(2))then - do i=2,npes - if(INT(rdata(i)) /= pelist(i))then - write(6,*) "Gathered data ",INT(rdata(i)), " NE reference ",pelist(i), "at i=",i - call mpp_error(FATAL, "Test gather uniform vector with reduced pelist failed") - endif - enddo - endif - call mpp_sync() - write(out_unit,*) "Test gather uniform vector with reduced pelist successful" - - end subroutine test_gather - - - subroutine test_gatherV(npes,pe,root,out_unit) - implicit none - integer, intent(in) :: npes,pe,root,out_unit - - integer :: pelist(npes),rsize(npes) - integer :: i,j,k,dsize,ssize - real,allocatable :: sdata(:), rdata(:), ref(:) - - if(npes < 3)then - call mpp_error(FATAL, "Test_gatherV: minimum of 3 ranks required. Not testing gather; too few ranks.") - elseif(npes > 9999)then - call mpp_error(FATAL, "Test_gatherV: maximum of 9999 ranks supported. 
Not testing gatherV; too many ranks.") - endif - write(out_unit,*) - - ssize = pe+1 - allocate(sdata(ssize)) - do i=1,ssize - sdata(i) = pe + 0.0001*i - enddo - do i=1,npes - pelist(i) = i-1 - rsize(i) = i - enddo - - dsize = sum(rsize) - allocate(rdata(dsize),ref(dsize)) - rdata = -1.0 - k=1 - do j=1,npes - do i=1,rsize(j) - ref(k) = pelist(j) + 0.0001*i - k = k+1 - enddo;enddo - - call mpp_gather(sdata,ssize,rdata,rsize) - - if(pe == root)then - k = 1 - do j=1,npes - do i=1,rsize(j) - if(rdata(k) /= ref(k))then - write(6,*) "Gathered data ",rdata(k), " NE reference ",ref(k), "at k=",k - call mpp_error(FATAL, "Test gatherV global pelist failed") - endif - k = k+1 - enddo;enddo - endif - - call mpp_sync() - write(out_unit,*) "Test gatherV with global pelist successful" - - rdata = -1.0 - ref(1) = -1.0 - - if(ANY(pe == pelist(2:npes)))call mpp_gather(sdata,ssize,rdata(2:),rsize(2:),pelist(2:npes)) - - if(pe == pelist(2))then - k = 1 - do j=1,npes - do i=1,rsize(j) - if(rdata(k) /= ref(k))then - write(6,*) "Gathered data ",rdata(k), " NE reference ",ref(k), "at k=",k - call mpp_error(FATAL, "Test gatherV with reduced pelist failed") - endif - k = k+1 - enddo;enddo - endif - call mpp_sync() - - write(out_unit,*) "Test gatherV with reduced pelist successful" - deallocate(sdata,rdata,ref) - end subroutine test_gatherV - -subroutine test_gather2DV(npes,pe,root,out_unit) - implicit none - integer, intent(in) :: npes,pe,root,out_unit - - integer :: pelist(npes),rsize(npes) - integer :: pelist2(npes),rsize2(npes) - integer :: i,j,k,l,nz,ssize,nelems - real,allocatable,dimension(:,:) :: data, cdata, sbuff,rbuff - real,allocatable :: ref(:,:) - integer, parameter :: KSIZE=10 - - real :: sbuff1D(size(sbuff)) - real :: rbuff1D(size(rbuff)) - pointer(sptr,sbuff1D); pointer(rptr,rbuff1D) - - - if(npes < 3)then - call mpp_error(FATAL, "Test_gather2DV: minimum of 3 ranks required. 
Not testing gather; too few ranks.") - elseif(npes > 9999)then - call mpp_error(FATAL, "Test_gather2DV: maximum of 9999 ranks supported. Not testing gather2DV; too many ranks.") - return - endif - write(out_unit,*) - - ssize = pe+1 - allocate(data(ssize,KSIZE)) - do k=1,KSIZE; do i=1,ssize - data(i,k) = 10000.0*k + pe + 0.0001*i - enddo; enddo - do i=1,npes - pelist(i) = i-1 - rsize(i) = i - enddo - - nz = KSIZE - nelems = sum(rsize(:)) - - allocate(rbuff(nz,nelems)); rbuff = -1.0 - allocate(ref(nelems,nz),cdata(nelems,nz)) - ref = 0.0; cdata = 0.0 - if(pe == root)then - do k=1,KSIZE - l=1 - do j=1,npes - do i=1,rsize(j) - ref(l,k) = 10000.0*k + pelist(j) + 0.0001*i - l = l+1 - enddo; enddo;enddo - endif - allocate(sbuff(nz,ssize)) - ! this matrix inversion makes for easy gather to the IO root - ! and a clear, concise unpack - do j=1,ssize - do i=1,nz - sbuff(i,j) = data(j,i) - enddo; enddo - - ! Note that the gatherV implied here is asymmetric; only root needs to know the vector of recv size - sptr = LOC(sbuff); rptr = LOC(rbuff) - call mpp_gather(sbuff1D,size(sbuff),rbuff1D,nz*rsize(:)) - - if(pe == root)then - do j=1,nz - do i=1,nelems - cdata(i,j) = rbuff(j,i) - enddo; enddo - do j=1,nz - do i=1,nelems - if(cdata(i,j) /= ref(i,j))then - write(6,*) "Gathered data ",cdata(i,j), " NE reference ",ref(i,j), "at i,j=",i,j - call mpp_error(FATAL, "Test gather2DV global pelist failed") - endif - enddo;enddo - endif - - call mpp_sync() - write(out_unit,*) "Test gather2DV with global pelist successful" - - do i=1,npes - pelist2(i) = pelist(npes-i+1) - rsize2(i) = rsize(npes-i+1) - enddo - - rbuff = -1.0 - ref = 0.0; cdata = 0.0 - if(pe == pelist2(1))then - do k=1,KSIZE - l=1 - do j=1,npes - do i=1,rsize2(j) - ref(l,k) = 10000.0*k + pelist2(j) + 0.0001*i - l = l+1 - enddo; enddo;enddo - endif - - call mpp_gather(sbuff1D,size(sbuff),rbuff1D,nz*rsize2(:),pelist2) - - if(pe == pelist2(1))then - do j=1,nz - do i=1,nelems - cdata(i,j) = rbuff(j,i) - enddo; enddo - do j=1,nz - 
do i=1,nelems - if(cdata(i,j) /= ref(i,j))then - write(6,*) "Gathered data ",cdata(i,j), " NE reference ",ref(i,j), "at i,j=",i,j - call mpp_error(FATAL, "Test gather2DV with reversed pelist failed") - endif - enddo;enddo - endif - call mpp_sync() - write(out_unit,*) "Test gather2DV with reversed pelist successful" - deallocate(data,sbuff,rbuff,cdata,ref) - end subroutine test_gather2DV - - subroutine test_time_transmit() - - allocate( a(n), b(n) ) - id = mpp_clock_id( 'Random number' ) - call mpp_clock_begin(id) - call random_number(a) - call mpp_clock_end (id) - - id = mpp_clock_id( 'mpp_transmit' ) - call mpp_clock_begin(id) - !timing is done for cyclical pass (more useful than ping-pong etc) - l = n - do while( l.GT.0 ) - !--- mpp_transmit ------------------------------------------------- - call mpp_sync() - call SYSTEM_CLOCK(tick0) - do i = 1,npes - call mpp_transmit( put_data=a(1), plen=l, to_pe=modulo(pe+npes-i,npes), & - get_data=b(1), glen=l, from_pe=modulo(pe+i,npes) ) - call mpp_sync_self() - end do - call mpp_sync() - call SYSTEM_CLOCK(tick) - dt = real(tick-tick0)/(npes*ticks_per_sec) - dt = max( dt, epsilon(dt) ) - if( pe.EQ.root ) print *, 'MPP_TRANSMIT length, time, bw(Mb/s)=', l, dt, l*8e-6/dt - l = l/2 - end do - - end subroutine test_time_transmit - - subroutine test_mpp_sum() - - a = real(pe+1) - call mpp_sync() - call SYSTEM_CLOCK(tick0) - call mpp_sum(a(1:1000),1000) - call SYSTEM_CLOCK(tick) - dt = real(tick-tick0)/ticks_per_sec - dt = max( dt, epsilon(dt) ) - if( pe.EQ.root )write( out_unit,'(a,2i6,f9.1,i8,f13.6,f8.2/)' ) & - 'mpp_sum: pe, npes, sum(pe+1), length, time, bw(Mb/s)=', pe, npes, a(1), n, dt, n*8e-6/dt - call mpp_clock_end(id) - - end subroutine test_mpp_sum - subroutine test_mpp_max + allocate( a(n), b(n) ) a = real(pe+1) print *, 'pe, pe+1 =', pe, a(1) call mpp_max( a(1) ) @@ -445,7 +148,7 @@ subroutine test_mpp_chksum() end subroutine test_mpp_chksum subroutine test_shared_pointers(locd,n) - integer(LONG_KIND), intent(in) :: 
locd + integer(i8_kind), intent(in) :: locd integer :: n real :: dd(n) pointer( p, dd ) diff --git a/test_fms/mpp/test_mpp_alltoall.F90 b/test_fms/mpp/test_mpp_alltoall.F90 new file mode 100644 index 0000000000..a0a3b5a0dc --- /dev/null +++ b/test_fms/mpp/test_mpp_alltoall.F90 @@ -0,0 +1,1653 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @file +!! @brief unit test for mpp_alltoall +!! @author MiKyung Lee +!! @description +!! test sending/receiving 1 element so that, for example for npes=4, +!! process0: [ 0, 1, 2, 3] --alltoall--> [0,10,20,30] +!! process1: [10,11,12,13] --alltoall--> [1,11,21,31] +!! process2: [20,21,22,23] --alltoall--> [2,12,22,32] +!! process3: [30,31,32,33] --alltoall--> [3,13,23,33] +!! and test sending/receiving more than 1 element so that, for example, for npes=4 and Nsend=nrecv=2 +!! process0: [ 0, 1, 2, 3, 4, 5, 6, 7] --alltoall--> [0,1,10,11,20,21,30,31] +!! process1: [10,11,12,13,14,15,16,17] --alltoall--> [2,3,12,13,22,23,32,33] +!! process2: [20,21,22,23,24,25,26,27] --alltoall--> [4,5,14,15,24,25,34,35] +!! process3: [30,31,32,33,34,35,36,37] --alltoall--> [6,7,16,17,26,27,36,37] +!! 
https://www.olcf.ornl.gov/wp-content/uploads/2018/06/intro_to_HPC_intro_to_mpi.pdf + +program test_mpp_alltoall + + use platform_mod + use mpp_mod, only : mpp_init, mpp_init_test_requests_allocated, mpp_init_test_peset_allocated, mpp_error, FATAL + use mpp_mod, only : mpp_pe, mpp_npes, mpp_alltoall + use mpp_mod, only : mpp_type_create, mpp_type, mpp_byte + + implicit none + + integer :: npes, ierr + + !> initialize MPI + call mpp_init( test_level=mpp_init_test_requests_allocated ) + + !> get total number of pe's + npes = mpp_npes() + + !> call tests + call test_mpp_alltoall_real4(npes) + call test_mpp_alltoall_real8(npes) + call test_mpp_alltoall_int4(npes) + call test_mpp_alltoall_int8(npes) + + call test_mpp_alltoallv_real4(npes) + call test_mpp_alltoallv_real8(npes) + call test_mpp_alltoallv_int4(npes) + call test_mpp_alltoallv_int8(npes) + + call test_mpp_alltoallw_real4(npes) + call test_mpp_alltoallw_real8(npes) + call test_mpp_alltoallw_int4(npes) + call test_mpp_alltoallw_int8(npes) + + call MPI_FINALIZE(ierr) + + + contains + + !> + !> test mpp_alltoall for real4 + !> + + subroutine test_mpp_alltoall_real4(npes) + + implicit none + + integer, intent(in) :: npes + + real(r4_kind), parameter :: zero = 0., one = 1. + + integer :: pe, ierr, i, ii, N, isend, jsend, irecv, nsend, nrecv + real(r4_kind), allocatable :: sbuf(:), rbuf(:) + + !> get pe + pe = mpp_pe() + + !> test sending/receiving up to npes elements. can set up to 9 elements. 
+ nsend = npes ; nrecv = nsend + if ( npes > 9 ) then + nsend = 9 ; nrecv = 9 + end if + + do isend=1, nsend + + !> allocate sbuf (senddata), rbuf (receivedata) + N = isend*npes - 1 + allocate( sbuf(0:N), rbuf(0:N) ) + + !> initialize receiving array + rbuf = -one + + !> intialize sending array + do i=0, N + sbuf(i) = real( 10*pe+i, kind=r4_kind ) + end do + + !> number of elements to send and receive + irecv = isend + + !> call mpp_alltoall to send/receive one element + call mpp_alltoall( sbuf, isend, rbuf, irecv ) + + !> check + ii = 0 + do i=0, (npes-1) + do jsend=0, isend-1 + if( rbuf(ii) .ne. real( 10*i+isend*pe+jsend, kind=r4_kind ) ) then + write(*,'("PE #",i3,"element",i4,"Expected",f6.0,"but received",f6.0)') pe, ii, real(10*i+nsend*pe+jsend), rbuf(ii) + call mpp_error(FATAL, 'test_mpp_alltoall failed') + end if + ii = ii + 1 + end do + end do + + deallocate( sbuf, rbuf ) + + end do + + end subroutine test_mpp_alltoall_real4 + + !> + !> test mpp_alltoall for real8 + !> + + subroutine test_mpp_alltoall_real8(npes) + + implicit none + + integer, intent(in) :: npes + + real(r8_kind), parameter :: zero = 0., one=1. + + integer :: pe, ierr, i, ii, N, isend, jsend, irecv, nsend, nrecv + real(r8_kind), allocatable :: sbuf(:), rbuf(:) + + !> get pe + pe = mpp_pe() + + !> test sending/receiving up to npes elements. can set up to 9 elements. + nsend = npes ; nrecv = nsend + if ( npes > 9 ) then + nsend = 9 ; nrecv = 9 + end if + + do isend=1, nsend + + !> allocate sbuf (senddata), rbuf (receivedata) + N = isend*npes - 1 + allocate( sbuf(0:N), rbuf(0:N) ) + + !> initialize receiving array + rbuf = - one + + !> intialize sending array + do i=0, N + sbuf(i) = real( 10*pe+i, kind=r8_kind ) + end do + + !> number of elements to send and receive + irecv = isend + + !> call mpp_alltoall to send/receive one element + call mpp_alltoall( sbuf, isend, rbuf, irecv ) + + !> check + ii = 0 + do i=0, (npes-1) + do jsend=0, isend-1 + if( rbuf(ii) .ne. 
real( 10*i+isend*pe+jsend, kind=r8_kind ) ) then + write(*,'("PE #",i3,"element",i4,"Expected",f6.0,"but received",f6.0)') pe, ii, real(10*i+nsend*pe+jsend), rbuf(ii) + call mpp_error(FATAL, 'test_mpp_alltoall failed') + end if + ii = ii + 1 + end do + end do + + deallocate( sbuf, rbuf ) + + end do + + end subroutine test_mpp_alltoall_real8 + + !> + !> test mpp_alltoall for int4 + !> + + subroutine test_mpp_alltoall_int4(npes) + + implicit none + + integer, intent(in) :: npes + + integer(i4_kind), parameter :: zero = 0, one=1 + + integer :: pe, ierr, i, ii, N, isend, jsend, irecv, nsend, nrecv + integer(i4_kind), allocatable :: sbuf(:), rbuf(:) + + !> get pe + pe = mpp_pe() + + !> test sending/receiving up to npes elements. can set up to 9 elements. + nsend = npes ; nrecv = nsend + if ( npes > 9 ) then + nsend = 9 ; nrecv = 9 + end if + + do isend=1, nsend + + !> allocate sbuf (senddata), rbuf (receivedata) + N = isend*npes - 1 + allocate( sbuf(0:N), rbuf(0:N) ) + + !> initialize receiving array + rbuf = - one + + !> intialize sending array + do i=0, N + sbuf(i) = real( 10*pe+i, kind=i4_kind ) + end do + + !> number of elements to send and receive + irecv = isend + + !> call mpp_alltoall to send/receive one element + call mpp_alltoall( sbuf, isend, rbuf, irecv ) + + !> check + ii = 0 + do i=0, (npes-1) + do jsend=0, isend-1 + if( rbuf(ii) .ne. 
real( 10*i+isend*pe+jsend, kind=i4_kind ) ) then + write(*,'("PE #",i3,"element",i4,"Expected",i6,"but received",i6)') pe, ii, real(10*i+nsend*pe+jsend), rbuf(ii) + call mpp_error(FATAL, 'test_mpp_alltoall failed') + end if + ii = ii + 1 + end do + end do + + deallocate( sbuf, rbuf ) + + end do + + end subroutine test_mpp_alltoall_int4 + + !> + !> test mpp_alltoall for int8 + !> + + subroutine test_mpp_alltoall_int8(npes) + + implicit none + + integer, intent(in) :: npes + + integer(i8_kind), parameter :: zero = 0, one=1 + + integer :: pe, ierr, i, ii, N, isend, jsend, irecv, nsend, nrecv + integer(i8_kind), allocatable :: sbuf(:), rbuf(:) + + !> get pe + pe = mpp_pe() + + !> test sending/receiving up to npes elements. can set up to 9 elements. + nsend = npes ; nrecv = nsend + if ( npes > 9 ) then + nsend = 9 ; nrecv = 9 + end if + + do isend=1, nsend + + !> allocate sbuf (senddata), rbuf (receivedata) + N = isend*npes - 1 + allocate( sbuf(0:N), rbuf(0:N) ) + + !> initialize receiving array + rbuf = - one + + !> intialize sending array + do i=0, N + sbuf(i) = real( 10*pe+i, kind=i8_kind ) + end do + + !> number of elements to send and receive + irecv = isend + + !> call mpp_alltoall to send/receive one element + call mpp_alltoall( sbuf, isend, rbuf, irecv ) + + !> check + ii = 0 + do i=0, (npes-1) + do jsend=0, isend-1 + if( rbuf(ii) .ne. real( 10*i+isend*pe+jsend, kind=i8_kind ) ) then + write(*,'("PE #",i3,"element",i4,"Expected",i6,"but received",i6)') pe, ii, real(10*i+nsend*pe+jsend), rbuf(ii) + call mpp_error(FATAL, 'test_mpp_alltoall failed') + end if + ii = ii + 1 + end do + end do + + deallocate( sbuf, rbuf ) + + end do + + end subroutine test_mpp_alltoall_int8 + + !> + !> test mpp_alltoallv for real4 + !> + + subroutine test_mpp_alltoallv_real4(npes) + + implicit none + + integer, intent(in) :: npes + + real(r4_kind) :: zero = 0., one = 1. 
+ + integer :: pe, ierr, i, ii, N + integer, allocatable :: ssize(:), rsize(:), sdispl(:), rdispl(:) + real(r4_kind), allocatable :: sbuf(:), rbuf(:) + + !> get pe + pe = mpp_pe() + N = npes - 1 + + allocate( sbuf(0:N), rbuf(0:N) ) + allocate( ssize(0:N), rsize(0:N) ) + allocate( sdispl(0:N), rdispl(0:N) ) + + !>send one, receive one + !! process0: [ 0, 1, 2, 3] --alltoallv--> [0,10,20,30] + !! process1: [10,11,12,13] --alltoallv--> [1,11,21,31] + !! process2: [20,21,22,23] --alltoallv--> [2,12,22,32] + !! process3: [30,31,32,33] --alltoallv--> [3,13,23,33] + + ssize = 1 ; rsize = 1 + + do i=0, N + sdispl(i) = i ; rdispl(i) = i + end do + + do i=0, N + sbuf(i) = real( 10*pe+i, kind=r4_kind ) + end do + rbuf = -one + + call mpp_alltoall(sbuf, ssize, sdispl, rbuf, rsize, rdispl ) + + !> check + do i=0, N + if ( rbuf(i).ne.real(10*i+pe, kind=r4_kind) ) call mpp_error( FATAL, 'test mpp_alltoallv fail' ) + end do + + !>send one element, receive one + !! process0: [ 0, 1, 2, 3, 4, 5, 6, 7] --alltoallv--> [0,-1,10,-1,20,-1,30,-1] + !! process1: [10,11,12,13,14,15,16,17] --alltoallv--> [2,-1,12,-1,22,-1,32,-1] + !! process2: [20,21,22,23,24,25,26,27] --alltoallv--> [4,-1,14,-1,24,-1,34,-1] + !! 
process3: [30,31,32,33,34,35,36,37] --alltoallv--> [6,-1,16,-1,26,-1,36,-1] + + ssize = 1 ; rsize = 1 + + deallocate( sbuf, rbuf ) + allocate( sbuf(0:(2*npes-1)), rbuf(0:(2*npes-1)) ) + + do i=0, N + sdispl(i) = 2*i ; rdispl(i) = 2*i + end do + + do i=0, N + sbuf(2*i) = real( 10*pe+2*i, kind=r4_kind ) + sbuf(2*i+1) = real( 10*pe+2*i+1, kind=r4_kind ) + end do + + rbuf = real(-1.0, kind=r4_kind ) + + call mpp_alltoall(sbuf, ssize, sdispl, rbuf, rsize, rdispl) + + !> check + do i=0, N + if ( rbuf(2*i).ne.real(10*i+2*pe, kind=r4_kind) ) call mpp_error( FATAL, 'test mpp_alltoallv fail' ) + if ( rbuf(2*i+1).ne.-one ) call mpp_error( FATAL, 'test mpp_alltoallv fail' ) + end do + + end subroutine test_mpp_alltoallv_real4 + + !> + !> test mpp_alltoallv for real8 + !> + + subroutine test_mpp_alltoallv_real8(npes) + + implicit none + + integer, intent(in) :: npes + + real(r8_kind) :: zero = 0., one = 1. + + integer :: pe, ierr, i, ii, N + integer, allocatable :: ssize(:), rsize(:), sdispl(:), rdispl(:) + real(r8_kind), allocatable :: sbuf(:), rbuf(:) + + !> get pe + pe = mpp_pe() + N = npes - 1 + + allocate( sbuf(0:N), rbuf(0:N) ) + allocate( ssize(0:N), rsize(0:N) ) + allocate( sdispl(0:N), rdispl(0:N) ) + + !>send one, receive one + !! process0: [ 0, 1, 2, 3] --alltoallv--> [0,10,20,30] + !! process1: [10,11,12,13] --alltoallv--> [1,11,21,31] + !! process2: [20,21,22,23] --alltoallv--> [2,12,22,32] + !! process3: [30,31,32,33] --alltoallv--> [3,13,23,33] + + ssize = 1 ; rsize = 1 + + do i=0, N + sdispl(i) = i ; rdispl(i) = i + end do + + do i=0, N + sbuf(i) = real( 10*pe+i, kind=r8_kind ) + end do + rbuf = -one + + call mpp_alltoall(sbuf, ssize, sdispl, rbuf, rsize, rdispl ) + + !> check + do i=0, N + if ( rbuf(i).ne.real(10*i+pe, kind=r8_kind) ) call mpp_error( FATAL, 'test mpp_alltoallv fail' ) + end do + + !>send one element, receive one + !! process0: [ 0, 1, 2, 3, 4, 5, 6, 7] --alltoallv--> [0,-1,10,-1,20,-1,30,-1] + !! 
process1: [10,11,12,13,14,15,16,17] --alltoallv--> [2,-1,12,-1,22,-1,32,-1]
+    !! process2: [20,21,22,23,24,25,26,27] --alltoallv--> [4,-1,14,-1,24,-1,34,-1]
+    !! process3: [30,31,32,33,34,35,36,37] --alltoallv--> [6,-1,16,-1,26,-1,36,-1]
+
+    ssize = 1 ; rsize = 1
+
+    deallocate( sbuf, rbuf )
+    allocate( sbuf(0:(2*npes-1)), rbuf(0:(2*npes-1)) )
+
+    do i=0, N
+       sdispl(i) = 2*i ; rdispl(i) = 2*i
+    end do
+
+    do i=0, N
+       sbuf(2*i)   = real( 10*pe+2*i,   kind=r8_kind )
+       sbuf(2*i+1) = real( 10*pe+2*i+1, kind=r8_kind )
+    end do
+
+    rbuf = real(-1.0, kind=r8_kind )
+
+    call mpp_alltoall(sbuf, ssize, sdispl, rbuf, rsize, rdispl)
+
+    !> check: even slots hold the gathered values, odd slots must be untouched
+    do i=0, N
+       if ( rbuf(2*i).ne.real(10*i+2*pe, kind=r8_kind) ) call mpp_error( FATAL, 'test mpp_alltoallv fail' )
+       if ( rbuf(2*i+1).ne.-one ) call mpp_error( FATAL, 'test mpp_alltoallv fail' )
+    end do
+
+  end subroutine test_mpp_alltoallv_real8
+
+  !>
+  !> test mpp_alltoallv for int4
+  !>
+
+  subroutine test_mpp_alltoallv_int4(npes)
+
+    implicit none
+
+    integer, intent(in) :: npes
+
+    !> parameter avoids the implicit save of initialized locals
+    integer(i4_kind), parameter :: zero = 0, one = 1
+
+    integer :: pe, ierr, i, ii, N
+    integer, allocatable :: ssize(:), rsize(:), sdispl(:), rdispl(:)
+    !> buffers are INTEGER in the integer flavor of this test (was mistakenly real(i4_kind))
+    integer(i4_kind), allocatable :: sbuf(:), rbuf(:)
+
+    !> get pe
+    pe = mpp_pe()
+    N = npes - 1
+
+    allocate( sbuf(0:N), rbuf(0:N) )
+    allocate( ssize(0:N), rsize(0:N) )
+    allocate( sdispl(0:N), rdispl(0:N) )
+
+    !>send one, receive one
+    !! process0: [ 0, 1, 2, 3] --alltoallv--> [0,10,20,30]
+    !! process1: [10,11,12,13] --alltoallv--> [1,11,21,31]
+    !! process2: [20,21,22,23] --alltoallv--> [2,12,22,32]
+    !! 
process3: [30,31,32,33] --alltoallv--> [3,13,23,33]
+
+    ssize = 1 ; rsize = 1
+
+    do i=0, N
+       sdispl(i) = i ; rdispl(i) = i
+    end do
+
+    !> fill the integer buffer with the int intrinsic (was real(..., kind=i4_kind))
+    do i=0, N
+       sbuf(i) = int( 10*pe+i, kind=i4_kind )
+    end do
+    rbuf = -one
+
+    call mpp_alltoall(sbuf, ssize, sdispl, rbuf, rsize, rdispl )
+
+    !> check
+    do i=0, N
+       if ( rbuf(i).ne.int(10*i+pe, kind=i4_kind) ) call mpp_error( FATAL, 'test mpp_alltoallv fail' )
+    end do
+
+    !>send one element, receive one
+    !! process0: [ 0, 1, 2, 3, 4, 5, 6, 7] --alltoallv--> [0,-1,10,-1,20,-1,30,-1]
+    !! process1: [10,11,12,13,14,15,16,17] --alltoallv--> [2,-1,12,-1,22,-1,32,-1]
+    !! process2: [20,21,22,23,24,25,26,27] --alltoallv--> [4,-1,14,-1,24,-1,34,-1]
+    !! process3: [30,31,32,33,34,35,36,37] --alltoallv--> [6,-1,16,-1,26,-1,36,-1]
+
+    ssize = 1 ; rsize = 1
+
+    deallocate( sbuf, rbuf )
+    allocate( sbuf(0:(2*npes-1)), rbuf(0:(2*npes-1)) )
+
+    do i=0, N
+       sdispl(i) = 2*i ; rdispl(i) = 2*i
+    end do
+
+    do i=0, N
+       sbuf(2*i)   = int( 10*pe+2*i,   kind=i4_kind )
+       sbuf(2*i+1) = int( 10*pe+2*i+1, kind=i4_kind )
+    end do
+
+    rbuf = -one
+
+    call mpp_alltoall(sbuf, ssize, sdispl, rbuf, rsize, rdispl)
+
+    !> check: even slots hold the gathered values, odd slots must be untouched
+    do i=0, N
+       if ( rbuf(2*i).ne.int(10*i+2*pe, kind=i4_kind) ) call mpp_error( FATAL, 'test mpp_alltoallv fail' )
+       if ( rbuf(2*i+1).ne.-one ) call mpp_error( FATAL, 'test mpp_alltoallv fail' )
+    end do
+
+  end subroutine test_mpp_alltoallv_int4
+
+  !>
+  !> test mpp_alltoallv for int8
+  !>
+
+  subroutine test_mpp_alltoallv_int8(npes)
+
+    implicit none
+
+    integer, intent(in) :: npes
+
+    !> parameter avoids the implicit save of initialized locals
+    integer(i8_kind), parameter :: zero = 0, one = 1
+
+    integer :: pe, ierr, i, ii, N
+    integer, allocatable :: ssize(:), rsize(:), sdispl(:), rdispl(:)
+    !> buffers are INTEGER in the integer flavor of this test (was mistakenly real(i8_kind))
+    integer(i8_kind), allocatable :: sbuf(:), rbuf(:)
+
+    !> get pe
+    pe = mpp_pe()
+    N = npes - 1
+
+    allocate( sbuf(0:N), rbuf(0:N) )
+    allocate( ssize(0:N), rsize(0:N) )
+    allocate( sdispl(0:N), rdispl(0:N) )
+
+    !>send one, receive one
+    !! process0: [ 0, 1, 2, 3] --alltoallv--> [0,10,20,30]
+    !! 
process1: [10,11,12,13] --alltoallv--> [1,11,21,31]
+    !! process2: [20,21,22,23] --alltoallv--> [2,12,22,32]
+    !! process3: [30,31,32,33] --alltoallv--> [3,13,23,33]
+
+    ssize = 1 ; rsize = 1
+
+    do i=0, N
+       sdispl(i) = i ; rdispl(i) = i
+    end do
+
+    !> fill the integer buffer with the int intrinsic (was real(..., kind=i8_kind))
+    do i=0, N
+       sbuf(i) = int( 10*pe+i, kind=i8_kind )
+    end do
+    rbuf = -one
+
+    call mpp_alltoall(sbuf, ssize, sdispl, rbuf, rsize, rdispl )
+
+    !> check
+    do i=0, N
+       if ( rbuf(i).ne.int(10*i+pe, kind=i8_kind) ) call mpp_error( FATAL, 'test mpp_alltoallv fail' )
+    end do
+
+    !>send one element, receive one
+    !! process0: [ 0, 1, 2, 3, 4, 5, 6, 7] --alltoallv--> [0,-1,10,-1,20,-1,30,-1]
+    !! process1: [10,11,12,13,14,15,16,17] --alltoallv--> [2,-1,12,-1,22,-1,32,-1]
+    !! process2: [20,21,22,23,24,25,26,27] --alltoallv--> [4,-1,14,-1,24,-1,34,-1]
+    !! process3: [30,31,32,33,34,35,36,37] --alltoallv--> [6,-1,16,-1,26,-1,36,-1]
+
+    ssize = 1 ; rsize = 1
+
+    deallocate( sbuf, rbuf )
+    allocate( sbuf(0:(2*npes-1)), rbuf(0:(2*npes-1)) )
+
+    do i=0, N
+       sdispl(i) = 2*i ; rdispl(i) = 2*i
+    end do
+
+    do i=0, N
+       sbuf(2*i)   = int( 10*pe+2*i,   kind=i8_kind )
+       sbuf(2*i+1) = int( 10*pe+2*i+1, kind=i8_kind )
+    end do
+
+    rbuf = -one
+
+    call mpp_alltoall(sbuf, ssize, sdispl, rbuf, rsize, rdispl)
+
+    !> check: even slots hold the gathered values, odd slots must be untouched
+    do i=0, N
+       if ( rbuf(2*i).ne.int(10*i+2*pe, kind=i8_kind) ) call mpp_error( FATAL, 'test mpp_alltoallv fail' )
+       if ( rbuf(2*i+1).ne.-one ) call mpp_error( FATAL, 'test mpp_alltoallv fail' )
+    end do
+
+  end subroutine test_mpp_alltoallv_int8
+
+  !>
+  !> test mpp_alltoallw_real4
+  !>
+
+  subroutine test_mpp_alltoallw_real4(npes)
+
+    implicit none
+
+    integer, intent(in) :: npes
+
+    integer, parameter :: n=9
+    integer, parameter :: byte4 = 4
+    real(r4_kind), parameter :: zero = 0. , one = 1. 
+ + integer :: pe, i, j, jj, jjj, k, kk + real(r4_kind) :: answer + + integer :: array_of_subsizes(3), array_of_starts(3) + integer :: subsize_i, subsize_j, subsize_k + integer :: start_i, start_j, start_k + integer :: ssize(0:npes-1), rsize(0:npes-1), sdispl(0:npes-1), rdispl(0:npes-1) + real(r4_kind), target :: sbuf(n,n,n), rbuf(n,n,n) + + real(r4_kind), dimension(:), pointer :: psbuf, prbuf + type(mpp_type) :: stype(0:npes-1), rtype(0:npes-1) + + + !> get pe + pe = mpp_pe() + + !> assign sbuf and rbuf data arrays + do i=1, n + do j=1, n + do k=1, n + sbuf(k,j,i) = real( pe*1000 + i*100 + j*10 + k, kind=r4_kind ) + end do + end do + end do + rbuf = - one + + !> + !> test each PE sending a column of length subsize_k, and starting from sbuf(start_k,start_j,start_i) + !> + + !> subarray dimensions + subsize_k = 5 ; subsize_j = 1 ; subsize_i = 1 + start_k = 3 ; start_j = 0 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = 2 * i * n * byte4 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = 2 * i * n * byte4 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k, start_j, start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + do i=1, n + do j=1, n + jj = int( (j-1)/2 ) + do k=1, n + answer=real( jj*1000 + i*100 + (2*pe+1)*10 + k, kind=r4_kind ) + if ( i.gt.subsize_i ) answer=-one + if ( mod(j,2).eq.0 .or. j.gt.2*npes ) answer=-one + if ( k.le.start_k .or. 
k.gt.subsize_k+start_k ) answer=-one + !if( pe==1 ) write(*,*) i,j,k, rbuf(k,j,i), answer + if ( rbuf(k,j,i) .ne. answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with columns' ) + end do + end do + end do + + !> + !> test each PE sending a row of length subsize_i, and starting from sbuf(start_k,start_j,start_i) + !> + + rbuf = - one + + !> subarray dimensions + subsize_k = 1 ; subsize_j = 5 ; subsize_i = 1 + start_k = 0 ; start_j = 2 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * byte4 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = i * byte4 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k, start_j, start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + do k=1, n + answer=real( (k-1)*1000 + i*100 + j*10 + pe+1, kind=r4_kind ) + if ( i .gt. subsize_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j+start_j ) answer=-one + if ( k .gt. npes ) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rows' ) + end do + end do + end do + + !> + !> send and receive subarray of rank 2 + !> + + rbuf = -one + + !> subarray dimensions + subsize_k = 2 ; subsize_j = 2 ; subsize_i = 1 + start_k = 0 ; start_j = 1 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * subsize_k * byte4 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = subsize_j * i * n * byte4 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k,start_j,start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + jj = int( (j-1-start_j)/subsize_j ) + jjj = mod( (j-1-start_j), subsize_j ) + 1 + start_j + do k=1, n + answer=real( jj*1000 + i*100 + jjj*10 + subsize_k*pe+k, kind=r4_kind ) + if ( i .gt. subsize_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j*npes+start_j ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rank 2 subarrays' ) + end do + end do + end do + + !> + !> send and receive subarray of rank 3 + !> + + rbuf = -one + + !> subarray dimensions + subsize_k = 2 ; subsize_j = 2 ; subsize_i = 2 + start_k = 1 ; start_j = 1 ; start_i = 1 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * subsize_k * byte4 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = subsize_j * i * n * byte4 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k,start_j,start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + jj = int( (j-1-start_j)/subsize_j ) + jjj = mod( (j-1-start_j), subsize_j ) + 1 + start_j + do k=1, n + answer=real( jj*1000 + i*100 + jjj*10 + subsize_k*pe+k, kind=r4_kind ) + if ( i.le.start_i .or. i.gt.subsize_i+start_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j*npes+start_j ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k) answer=-one + if ( rbuf(k,j,i) .ne. answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rank 3 subarrays' ) + end do + end do + end do + + end subroutine test_mpp_alltoallw_real4 + + !> + !> test mpp_alltoallw_real8 + !> + + subroutine test_mpp_alltoallw_real8(npes) + + implicit none + + integer, intent(in) :: npes + + integer, parameter :: n=9 + integer, parameter :: byte8 = 8 + real(r8_kind), parameter :: zero = 0. , one = 1. 
+ + integer :: pe, i, j, jj, jjj, k, kk + real(r8_kind) :: answer + + integer :: array_of_subsizes(3), array_of_starts(3) + integer :: subsize_i, subsize_j, subsize_k + integer :: start_i, start_j, start_k + integer :: ssize(0:npes-1), rsize(0:npes-1), sdispl(0:npes-1), rdispl(0:npes-1) + real(r8_kind), target :: sbuf(n,n,n), rbuf(n,n,n) + + real(r8_kind), dimension(:), pointer :: psbuf, prbuf + type(mpp_type) :: stype(0:npes-1), rtype(0:npes-1) + + + !> get pe + pe = mpp_pe() + + !> assign sbuf and rbuf data arrays + do i=1, n + do j=1, n + do k=1, n + sbuf(k,j,i) = real( pe*1000 + i*100 + j*10 + k, kind=r8_kind ) + end do + end do + end do + rbuf = - one + + !> + !> test each PE sending a column of length subsize_k, and starting from sbuf(start_k,start_j,start_i) + !> + + !> subarray dimensions + subsize_k = 5 ; subsize_j = 1 ; subsize_i = 1 + start_k = 3 ; start_j = 0 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = 2 * i * n * byte8 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = 2 * i * n * byte8 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k, start_j, start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + do i=1, n + do j=1, n + jj = int( (j-1)/2 ) + do k=1, n + answer=real( jj*1000 + i*100 + (2*pe+1)*10 + k, kind=r8_kind ) + if ( i.gt.subsize_i ) answer=-one + if ( mod(j,2).eq.0 .or. j.gt.2*npes ) answer=-one + if ( k.le.start_k .or. 
k.gt.subsize_k+start_k ) answer=-one + !if( pe==1 ) write(*,*) i,j,k, rbuf(k,j,i), answer + if ( rbuf(k,j,i) .ne. answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with columns' ) + end do + end do + end do + + !> + !> test each PE sending a row of length subsize_i, and starting from sbuf(start_k,start_j,start_i) + !> + + rbuf = - one + + !> subarray dimensions + subsize_k = 1 ; subsize_j = 5 ; subsize_i = 1 + start_k = 0 ; start_j = 2 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * byte8 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = i * byte8 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k, start_j, start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + do k=1, n + answer=real( (k-1)*1000 + i*100 + j*10 + pe+1, kind=r8_kind ) + if ( i .gt. subsize_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j+start_j ) answer=-one + if ( k .gt. npes ) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rows' ) + end do + end do + end do + + !> + !> send and receive subarray of rank 2 + !> + + rbuf = -one + + !> subarray dimensions + subsize_k = 2 ; subsize_j = 2 ; subsize_i = 1 + start_k = 0 ; start_j = 1 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * subsize_k * byte8 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = subsize_j * i * n * byte8 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k,start_j,start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + jj = int( (j-1-start_j)/subsize_j ) + jjj = mod( (j-1-start_j), subsize_j ) + 1 + start_j + do k=1, n + answer=real( jj*1000 + i*100 + jjj*10 + subsize_k*pe+k, kind=r8_kind ) + if ( i .gt. subsize_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j*npes+start_j ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rank 2 subarrays' ) + end do + end do + end do + + !> + !> send and receive subarray of rank 3 + !> + + rbuf = -one + + !> subarray dimensions + subsize_k = 2 ; subsize_j = 2 ; subsize_i = 2 + start_k = 1 ; start_j = 1 ; start_i = 1 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * subsize_k * byte8 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = subsize_j * i * n * byte8 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k,start_j,start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + jj = int( (j-1-start_j)/subsize_j ) + jjj = mod( (j-1-start_j), subsize_j ) + 1 + start_j + do k=1, n + answer=real( jj*1000 + i*100 + jjj*10 + subsize_k*pe+k, kind=r8_kind ) + if ( i.le.start_i .or. i.gt.subsize_i+start_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j*npes+start_j ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rank 3 subarrays' ) + end do + end do + end do + + end subroutine test_mpp_alltoallw_real8 + + !> + !> test mpp_alltoallw_int4 + !> + + subroutine test_mpp_alltoallw_int4(npes) + + implicit none + + integer, intent(in) :: npes + + integer, parameter :: n=9 + integer, parameter :: byte4 = 4 + integer(i4_kind), parameter :: zero = 0 , one = 1 + + integer :: pe, i, j, jj, jjj, k, kk + integer(i4_kind) :: answer + + integer :: array_of_subsizes(3), array_of_starts(3) + integer :: subsize_i, subsize_j, subsize_k + integer :: start_i, start_j, start_k + integer :: ssize(0:npes-1), rsize(0:npes-1), sdispl(0:npes-1), rdispl(0:npes-1) + integer(i4_kind), target :: sbuf(n,n,n), rbuf(n,n,n) + + integer(i4_kind), dimension(:), pointer :: psbuf, prbuf + type(mpp_type) :: stype(0:npes-1), rtype(0:npes-1) + + + !> get pe + pe = mpp_pe() + + !> assign sbuf and rbuf data arrays + do i=1, n + do j=1, n + do k=1, n + sbuf(k,j,i) = int( pe*1000 + i*100 + j*10 + k, kind=i4_kind ) + end do + end do + end do + rbuf = - one + + !> + !> test each PE sending a column of length subsize_k, and starting from sbuf(start_k,start_j,start_i) + !> + + !> subarray dimensions + subsize_k = 5 ; subsize_j = 1 ; subsize_i = 1 + start_k = 3 ; start_j = 0 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = 2 * i * n * byte4 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = 2 * i * n * byte4 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k, start_j, start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> 
mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + do i=1, n + do j=1, n + jj = int( (j-1)/2 ) + do k=1, n + answer=real( jj*1000 + i*100 + (2*pe+1)*10 + k, kind=i4_kind ) + if ( i.gt.subsize_i ) answer=-one + if ( mod(j,2).eq.0 .or. j.gt.2*npes ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k ) answer=-one + !if( pe==1 ) write(*,*) i,j,k, rbuf(k,j,i), answer + if ( rbuf(k,j,i) .ne. answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with columns' ) + end do + end do + end do + + !> + !> test each PE sending a row of length subsize_i, and starting from sbuf(start_k,start_j,start_i) + !> + + rbuf = - one + + !> subarray dimensions + subsize_k = 1 ; subsize_j = 5 ; subsize_i = 1 + start_k = 0 ; start_j = 2 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * byte4 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = i * byte4 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k, start_j, start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + do k=1, n + answer=int( (k-1)*1000 + i*100 + j*10 + pe+1, kind=i4_kind ) + if ( i .gt. subsize_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j+start_j ) answer=-one + if ( k .gt. npes ) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rows' ) + end do + end do + end do + + !> + !> send and receive subarray of rank 2 + !> + + rbuf = -one + + !> subarray dimensions + subsize_k = 2 ; subsize_j = 2 ; subsize_i = 1 + start_k = 0 ; start_j = 1 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * subsize_k * byte4 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = subsize_j * i * n * byte4 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k,start_j,start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + jj = int( (j-1-start_j)/subsize_j ) + jjj = mod( (j-1-start_j), subsize_j ) + 1 + start_j + do k=1, n + answer=int( jj*1000 + i*100 + jjj*10 + subsize_k*pe+k, kind=i4_kind ) + if ( i .gt. subsize_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j*npes+start_j ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rank 2 subarrays' ) + end do + end do + end do + + !> + !> send and receive subarray of rank 3 + !> + + rbuf = -one + + !> subarray dimensions + subsize_k = 2 ; subsize_j = 2 ; subsize_i = 2 + start_k = 1 ; start_j = 1 ; start_i = 1 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * subsize_k * byte4 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = subsize_j * i * n * byte4 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k,start_j,start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + jj = int( (j-1-start_j)/subsize_j ) + jjj = mod( (j-1-start_j), subsize_j ) + 1 + start_j + do k=1, n + answer=int( jj*1000 + i*100 + jjj*10 + subsize_k*pe+k, kind=i4_kind ) + if ( i.le.start_i .or. i.gt.subsize_i+start_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j*npes+start_j ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rank 3 subarrays' ) + end do + end do + end do + + end subroutine test_mpp_alltoallw_int4 + + !> + !> test mpp_alltoallw_int8 + !> + + subroutine test_mpp_alltoallw_int8(npes) + + implicit none + + integer, intent(in) :: npes + + integer, parameter :: n=9 + integer, parameter :: byte8 = 8 + integer(i8_kind), parameter :: zero = 0 , one = 1 + + integer :: pe, i, j, jj, jjj, k, kk + integer(i8_kind) :: answer + + integer :: array_of_subsizes(3), array_of_starts(3) + integer :: subsize_i, subsize_j, subsize_k + integer :: start_i, start_j, start_k + integer :: ssize(0:npes-1), rsize(0:npes-1), sdispl(0:npes-1), rdispl(0:npes-1) + integer(i8_kind), target :: sbuf(n,n,n), rbuf(n,n,n) + + integer(i8_kind), dimension(:), pointer :: psbuf, prbuf + type(mpp_type) :: stype(0:npes-1), rtype(0:npes-1) + + + !> get pe + pe = mpp_pe() + + !> assign sbuf and rbuf data arrays + do i=1, n + do j=1, n + do k=1, n + sbuf(k,j,i) = int( pe*1000 + i*100 + j*10 + k, kind=i8_kind ) + end do + end do + end do + rbuf = - one + + !> + !> test each PE sending a column of length subsize_k, and starting from sbuf(start_k,start_j,start_i) + !> + + !> subarray dimensions + subsize_k = 5 ; subsize_j = 1 ; subsize_i = 1 + start_k = 3 ; start_j = 0 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = 2 * i * n * byte8 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = 2 * i * n * byte8 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k, start_j, start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> 
mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + do i=1, n + do j=1, n + jj = int( (j-1)/2 ) + do k=1, n + answer=real( jj*1000 + i*100 + (2*pe+1)*10 + k, kind=i8_kind ) + if ( i.gt.subsize_i ) answer=-one + if ( mod(j,2).eq.0 .or. j.gt.2*npes ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k ) answer=-one + !if( pe==1 ) write(*,*) i,j,k, rbuf(k,j,i), answer + if ( rbuf(k,j,i) .ne. answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with columns' ) + end do + end do + end do + + !> + !> test each PE sending a row of length subsize_i, and starting from sbuf(start_k,start_j,start_i) + !> + + rbuf = - one + + !> subarray dimensions + subsize_k = 1 ; subsize_j = 5 ; subsize_i = 1 + start_k = 0 ; start_j = 2 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * byte8 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = i * byte8 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k, start_j, start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + do k=1, n + answer=int( (k-1)*1000 + i*100 + j*10 + pe+1, kind=i8_kind ) + if ( i .gt. subsize_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j+start_j ) answer=-one + if ( k .gt. npes ) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rows' ) + end do + end do + end do + + !> + !> send and receive subarray of rank 2 + !> + + rbuf = -one + + !> subarray dimensions + subsize_k = 2 ; subsize_j = 2 ; subsize_i = 1 + start_k = 0 ; start_j = 1 ; start_i = 0 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * subsize_k * byte8 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = subsize_j * i * n * byte8 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k,start_j,start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + jj = int( (j-1-start_j)/subsize_j ) + jjj = mod( (j-1-start_j), subsize_j ) + 1 + start_j + do k=1, n + answer=int( jj*1000 + i*100 + jjj*10 + subsize_k*pe+k, kind=i8_kind ) + if ( i .gt. subsize_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j*npes+start_j ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rank 2 subarrays' ) + end do + end do + end do + + !> + !> send and receive subarray of rank 3 + !> + + rbuf = -one + + !> subarray dimensions + subsize_k = 2 ; subsize_j = 2 ; subsize_i = 2 + start_k = 1 ; start_j = 1 ; start_i = 1 + + !> send one group to each PE + ssize = 1 + do i=0, npes-1 + sdispl(i) = i * subsize_k * byte8 + end do + + !> receive one group from each PE + rsize = 1 + do i=0, npes-1 + rdispl(i) = subsize_j * i * n * byte8 + end do + + !> subarrays (portion of data) in sbuf/rbuf to send/receive + array_of_subsizes=(/subsize_k, subsize_j, subsize_i/) + array_of_starts=(/start_k,start_j,start_i/) + + !> initialize mpp_type datatype + stype(:) = mpp_byte ; rtype(:) = mpp_byte + + !> create mpp_type datatype + do i=0, npes-1 + call mpp_type_create( sbuf, array_of_subsizes, array_of_starts, stype(i) ) + call mpp_type_create( rbuf, array_of_subsizes, array_of_starts, rtype(i) ) + end do + + !> mpp_alltoallW + psbuf(1:size(sbuf)) => sbuf ; prbuf(1:size(rbuf)) => rbuf + call mpp_alltoall( psbuf, ssize, sdispl, stype, prbuf, rsize, rdispl, stype ) + + !> check + do i=1, n + do j=1, n + jj = int( (j-1-start_j)/subsize_j ) + jjj = mod( (j-1-start_j), subsize_j ) + 1 + start_j + do k=1, n + answer=int( jj*1000 + i*100 + jjj*10 + subsize_k*pe+k, kind=i8_kind ) + if ( i.le.start_i .or. i.gt.subsize_i+start_i ) answer=-one + if ( j.le.start_j .or. j.gt.subsize_j*npes+start_j ) answer=-one + if ( k.le.start_k .or. k.gt.subsize_k+start_k) answer=-one + if ( rbuf(k,j,i) .ne. 
answer ) call mpp_error( FATAL, 'error in MPP_ALLTOALLW with rank 3 subarrays' ) + end do + end do + end do + + end subroutine test_mpp_alltoallw_int8 + +end program test_mpp_alltoall diff --git a/test_fms/mpp/test_mpp_alltoall.sh b/test_fms/mpp/test_mpp_alltoall.sh new file mode 100755 index 0000000000..c5bc0c3c45 --- /dev/null +++ b/test_fms/mpp/test_mpp_alltoall.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Tom Robinson 04/21/2020 + +# Set common test settings. +. ../test_common.sh + + +# Run the test on 4 processors +run_test test_mpp_alltoall 4 diff --git a/test_fms/mpp/test_mpp_broadcast b/test_fms/mpp/test_mpp_broadcast new file mode 100755 index 0000000000..3856b5b501 --- /dev/null +++ b/test_fms/mpp/test_mpp_broadcast @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_broadcast - temporary wrapper script for .libs/test_mpp_broadcast +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_broadcast program cannot be directly executed until all the libtool +# libraries that it depends on are installed. 
+# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. 
+# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_broadcast:test_mpp_broadcast:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_broadcast:test_mpp_broadcast:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_broadcast:test_mpp_broadcast:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. 
+func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. 
+ absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_broadcast' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_broadcast.F90 b/test_fms/mpp/test_mpp_broadcast.F90 index 587ef275a3..74f7d28886 100644 --- a/test_fms/mpp/test_mpp_broadcast.F90 +++ b/test_fms/mpp/test_mpp_broadcast.F90 @@ -16,84 +16,1088 @@ !* You should have received a copy of the GNU Lesser General Public !* License along with FMS. If not, see . !*********************************************************************** +!> @file +!! @brief unit test for mpp_broadcast +!! @email gfdl.climate.model.info@noaa.gov +!! @description This program tests the subroutines +!! mpp_broadcast_real4_2d to mpp_broadcast_real4_5d, +!! mpp_broadcast_real8_2d to mpp_broadcast_real8_5d, +!! mpp_broadcast_i4_2d to mpp_broadcast_i4_5d, +!! 
mpp_broadcast_i8_2d to mpp_broadcast_i8_5d, and mpp_broadcast_char program test_mpp_broadcast + use platform_mod use mpp_mod, only : mpp_init, mpp_init_test_peset_allocated, mpp_pe, mpp_npes, mpp_root_pe use mpp_mod, only : mpp_error, mpp_broadcast, FATAL + implicit none + integer :: ierr !< Used by MPI_FINALIZE call mpp_init(test_level=mpp_init_test_peset_allocated) - call test_broadcast_2D() + !> tests mpp_broadcast_*D_I4 + call test_broadcast_2D_I4() + call test_broadcast_3D_I4() + call test_broadcast_4D_I4() + call test_broadcast_5D_I4() + !> tests mpp_broadcast_*D_I8 + call test_broadcast_2D_I8() + call test_broadcast_3D_I8() + call test_broadcast_4D_I8() + call test_broadcast_5D_I8() + !> tests mpp_broadcast_*D_R4 + call test_broadcast_2D_R4() + call test_broadcast_3D_R4() + call test_broadcast_4D_R4() + call test_broadcast_5D_R4() + !> tests mpp_broadcast_*D_R8 + call test_broadcast_2D_R8() + call test_broadcast_3D_R8() + call test_broadcast_4D_R8() + call test_broadcast_5D_R8() + !> tests mpp_broadcast_char call test_broadcast_char() call MPI_FINALIZE(ierr) + contains +!> +!> test mpp_broadcast_2d_i4 +!> +subroutine test_broadcast_2D_I4() -subroutine test_broadcast_2D() - integer, parameter :: ARRAYSIZE = 3 - integer :: n, m, p - real :: r(3,3), k(3,3) - - p=0; - do n = 1, ARRAYSIZE - do m = 1, ARRAYSIZE - p = p + 1 - k(n, m) = p - r(n, m) = k(n, m) - enddo + implicit none + + integer, parameter :: NN = 3 + integer(i4_kind), parameter :: zero = 0, one=1 + + integer :: n, m + integer(i4_kind) :: p, r(NN,NN), k(NN,NN) + + p = zero + do n=1, NN + do m=1, NN + p = p + one + k(m,n) = p + enddo enddo - if(mpp_pe() .NE. mpp_root_pe()) then - do n =1, ARRAYSIZE - r(:, n) = 0 + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + if(r(m,n) .NE. 
k(m,n)) call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + else + do n=1, NN + do m=1, NN + if(r(m,n) == k(m,n)) call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + if(r(m,n) .NE. k(m,n)) call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") enddo + enddo + +end subroutine test_broadcast_2D_I4 +!> +!> test mpp_broadcast_3d_i4 +!> +subroutine test_broadcast_3D_i4() + + implicit none + + integer, parameter :: NN = 3 + integer(i4_kind), parameter :: zero = 0, one=1 + + integer :: i, n, m + integer(i4_kind) :: p, r(NN,NN,NN), k(NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + p = p + one + k(i,m,n) = p + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) .NE. k(i,m,n)) call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) == k(i,m,n)) call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo endif - !--- comparing array m and n. m and n are supposed to be different on pe other + call mpp_broadcast(r, NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) .NE. 
k(i,m,n)) call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + +end subroutine test_broadcast_3D_i4 +!> +!> test mpp_broadcoast_4D_i4 +!> +subroutine test_broadcast_4D_i4() + + implicit none + + integer, parameter :: NN = 3 + integer(i4_kind), parameter :: zero = 0, one=1 + + integer :: i, j, n, m + integer(i4_kind) :: p, r(NN,NN,NN,NN), k(NN,NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + p = p + one + k(j,i,m,n) = p + enddo + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) .NE. k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) == k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) .NE. k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + enddo + +end subroutine test_broadcast_4D_i4 +!> +!> test mpp_broadcast_5d_i4 +!> +subroutine test_broadcast_5D_I4() + + implicit none + + integer, parameter :: NN = 3 + integer(i4_kind), parameter :: zero = 0, one=1 + + integer :: i, j, l, n, m + integer(i4_kind) :: p, r(NN,NN,NN,NN,NN), k(NN,NN,NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + p = p + one + k(l,j,i,m,n) = p + enddo + enddo + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. 
r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) .NE. k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) == k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) .NE. k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + enddo + enddo + +end subroutine test_broadcast_5D_I4 +!> +!> test_broadcast_2d_i8 +!> +subroutine test_broadcast_2D_I8() + + implicit none + + integer, parameter :: NN = 3 + integer(i8_kind), parameter :: zero = 0, one=1 + + integer :: n, m + integer(i8_kind) :: p, r(NN,NN), k(NN,NN) + + p = zero + do n=1, NN + do m=1, NN + p = p + one + k(m,n) = p + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other !than root_pe if(mpp_pe() == mpp_root_pe()) then - do n = 1, ARRAYSIZE - do m = 1, ARRAYSIZE - if(r(n, m) .NE. k(n, m)) call mpp_error(FATAL, "test_broadcast: on root_pe, m should equal n") - enddo + do n=1, NN + do m=1, NN + if(r(m,n) .NE. 
k(m,n)) call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + else + do n=1, NN + do m=1, NN + if(r(m,n) == k(m,n)) call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + if(r(m,n) .NE. k(m,n)) call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") enddo + enddo + +end subroutine test_broadcast_2D_I8 +!> +!> test_broadcast_3D_i8 +!> +subroutine test_broadcast_3D_i8() + + implicit none + + integer, parameter :: NN = 3 + integer(i8_kind), parameter :: zero = 0, one=1 + + integer :: i, n, m + integer(i8_kind) :: p, r(NN,NN,NN), k(NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + p = p + one + k(i,m,n) = p + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) .NE. k(i,m,n)) call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo else - do n = 1, ARRAYSIZE - do m = 1, ARRAYSIZE - if(r(n, m) == k(n, m)) call mpp_error(FATAL, "test_broadcast: on non root_pes, m should equal n") - enddo + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) == k(i,m,n)) call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) .NE. 
k(i,m,n)) call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + +end subroutine test_broadcast_3D_i8 +!> +!> test mpp_broadcast_4d_i8 +!> +subroutine test_broadcast_4D_i8() + + implicit none + + integer, parameter :: NN = 3 + integer(i8_kind), parameter :: zero = 0, one=1 + + integer :: i, j, n, m + integer(i8_kind) :: p, r(NN,NN,NN,NN), k(NN,NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + p = p + one + k(j,i,m,n) = p + enddo + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) .NE. k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) == k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) .NE. k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + enddo + +end subroutine test_broadcast_4D_i8 +!> +!> test mpp_broadcast_5d_i8 +!> +subroutine test_broadcast_5D_I8() + + implicit none + + integer, parameter :: NN = 3 + integer(i8_kind), parameter :: zero = 0, one=1 + + integer :: i, j, l, n, m + integer(i8_kind) :: p, r(NN,NN,NN,NN,NN), k(NN,NN,NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + p = p + one + k(l,j,i,m,n) = p + enddo + enddo + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. 
r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) .NE. k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) == k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) .NE. k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + enddo + enddo + +end subroutine test_broadcast_5D_I8 +!> +!> test mpp_broadcast_2d_r4 +!> +subroutine test_broadcast_2D_R4() + + implicit none + + integer, parameter :: NN = 3 + real(r4_kind), parameter :: zero = 0., one=1. + + integer :: n, m + real(r4_kind) :: p, r(NN,NN), k(NN,NN) + + p=zero + do n = 1, NN + do m = 1, NN + p = p + one + k(m,n) = p + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n = 1, NN + do m = 1, NN + if(r(m,n) .NE. k(m,n)) call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + else + do n = 1, NN + do m = 1, NN + if(r(m,n) == k(m,n)) call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n = 1, NN + do m = 1, NN + if(r(m,n) .NE. 
k(m,n)) call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") enddo + enddo + +end subroutine test_broadcast_2D_R4 +!> +!> test mpp_broadcast_3d_r4 +!> +subroutine test_broadcast_3D_R4() + + implicit none + + integer, parameter :: NN = 3 + real(r4_kind), parameter :: zero = 0., one=1. + + integer :: i, n, m + real(r4_kind) :: p, r(NN,NN,NN), k(NN,NN,NN) + + p=zero + do n=1, NN + do m=1, NN + do i=1, NN + p = p + one + k(i,m,n) = p + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) .NE. k(i,m,n)) call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) == k(i,m,n)) call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo endif - call mpp_broadcast(r, ARRAYSIZE*ARRAYSIZE, mpp_root_pe()) + call mpp_broadcast(r, NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) .NE. k(i,m,n)) call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + +end subroutine test_broadcast_3D_R4 + + +!> test mpp_broadcoast_4D_R4 +subroutine test_broadcast_4D_R4() + + implicit none + + integer, parameter :: NN = 3 + real(r4_kind), parameter :: zero = 0., one=1. - !--- after broadcast, m and n should be the same - do n = 1, ARRAYSIZE - do m =1, ARRAYSIZE - if(r(n, m) .NE. 
k(n, m)) call mpp_error(FATAL, "test_broadcast: after broadcast, m should equal n") + integer :: i, j, n, m + real(r4_kind) :: p, r(NN,NN,NN,NN), k(NN,NN,NN,NN) + + p=zero + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + p = p + one + k(j,i,m,n) = p + enddo + enddo enddo enddo -end subroutine test_broadcast_2D + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) .NE. k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) == k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) .NE. k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + enddo + +end subroutine test_broadcast_4D_R4 + +!> test mpp_broadcast_5d_r4 +subroutine test_broadcast_5D_R4() + + implicit none + + integer, parameter :: NN = 3 + real(r4_kind), parameter :: zero = 0., one=1. + + integer :: i, j, l, n, m + real(r4_kind) :: p, r(NN,NN,NN,NN,NN), k(NN,NN,NN,NN,NN) + + p=zero + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + p = p + one + k(l,j,i,m,n) = p + enddo + enddo + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) .NE. 
k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) == k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) .NE. k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + enddo + enddo + +end subroutine test_broadcast_5D_R4 + + +!> test mpp_broadcast_2d_r8 +subroutine test_broadcast_2D_R8() + + implicit none + + integer, parameter :: NN = 3 + real(r8_kind), parameter :: zero = 0., one=1. + + integer :: n, m + real(r8_kind) :: p, r(NN,NN), k(NN,NN) + + p = zero + do n=1, NN + do m=1, NN + p = p + one + k(m,n) = p + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + if(r(m,n) .NE. k(m,n)) call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + else + do n = 1, NN + do m = 1, NN + if(r(m,n) == k(m,n)) call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + if(r(m,n) .NE. k(m,n)) call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + +end subroutine test_broadcast_2D_R8 +!> +!> test mpp_broadcast_3d_r8 +!> +subroutine test_broadcast_3D_R8() + + implicit none + + integer, parameter :: NN = 3 + real(r8_kind), parameter :: zero = 0., one=1. 
+ + integer :: i, n, m + real(r8_kind) :: p, r(NN,NN,NN), k(NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + p = p + one + k(i,m,n) = p + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) .NE. k(i,m,n)) call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) == k(i,m,n)) call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + if(r(i,m,n) .NE. k(i,m,n)) call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + +end subroutine test_broadcast_3D_R8 +!> +!> test mpp_broadcast_4d_R8 +!> +subroutine test_broadcast_4D_R8() + implicit none + + integer, parameter :: NN = 3 + real(r8_kind), parameter :: zero = 0., one=1. + + integer :: i, j, n, m + real(r8_kind) :: p, r(NN,NN,NN,NN), k(NN,NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + p = p + one + k(j,i,m,n) = p + enddo + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) .NE. 
k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) == k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + if(r(j,i,m,n) .NE. k(j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: after broadcast, r should equal k") + enddo + enddo + enddo + enddo + +end subroutine test_broadcast_4D_R8 +!> +!> test mpp_broadcast_5d_r8 +!> +subroutine test_broadcast_5D_R8() + + implicit none + + integer, parameter :: NN = 3 + real(r8_kind), parameter :: zero = 0., one=1. + + integer :: i, j, l, n, m + real(r8_kind) :: p, r(NN,NN,NN,NN,NN), k(NN,NN,NN,NN,NN) + + p = zero + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + p = p + one + k(l,j,i,m,n) = p + enddo + enddo + enddo + enddo + enddo + + r = k + if(mpp_pe() .NE. mpp_root_pe()) r = zero + + !--- comparing array r and k. r and k are supposed to be different on pe other + !than root_pe + if(mpp_pe() == mpp_root_pe()) then + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) .NE. k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on root_pe, r should equal k") + enddo + enddo + enddo + enddo + enddo + else + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) == k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: on non root_pes, r should not equal k") + enddo + enddo + enddo + enddo + enddo + endif + + call mpp_broadcast(r, NN*NN*NN*NN*NN, mpp_root_pe()) + + !--- after broadcast, r and k should be the same + do n=1, NN + do m=1, NN + do i=1, NN + do j=1, NN + do l=1, NN + if(r(l,j,i,m,n) .NE. 
k(l,j,i,m,n)) & + call mpp_error(FATAL, "test_broadcast: after broadcast, r should equalk") + enddo + enddo + enddo + enddo + enddo + +end subroutine test_broadcast_5D_R8 +!> +!> test mpp_broadcast_char +!> subroutine test_broadcast_char() - integer, parameter :: ARRAYSIZE = 3 + + implicit none + + integer, parameter :: NN = 3 integer, parameter :: STRINGSIZE = 256 - character(len=STRINGSIZE), dimension(ARRAYSIZE) :: textA, textB + character(len=STRINGSIZE), dimension(NN) :: textA, textB integer :: n textA(1) = "This is line 1 " textA(2) = "Here comes the line 2 " textA(3) = "Finally is line 3 " - do n = 1, ARRAYSIZE + do n = 1, NN textB(n) = TextA(n) enddo if(mpp_pe() .NE. mpp_root_pe()) then - do n =1, ARRAYSIZE + do n =1, NN textA(n) = "" enddo endif @@ -101,21 +1105,20 @@ subroutine test_broadcast_char() !--- comparing textA and textB. textA and textB are supposed to be !different on pe other than root_pe if(mpp_pe() == mpp_root_pe()) then - do n = 1, ARRAYSIZE + do n = 1, NN if(textA(n) .NE. textB(n)) call mpp_error(FATAL, "test_broadcast: on root_pe, textA should equal textB") enddo else - do n = 1, ARRAYSIZE + do n = 1, NN if(textA(n) == textB(n)) call mpp_error(FATAL, "test_broadcast: on root_pe, textA should not equal textB") enddo endif call mpp_broadcast(textA, STRINGSIZE, mpp_root_pe()) !--- after broadcast, textA and textB should be the same - do n = 1, ARRAYSIZE + do n = 1, NN if(textA(n) .NE. textB(n)) call mpp_error(FATAL, "test_broadcast: after broadcast, textA should equal textB") enddo end subroutine test_broadcast_char end program test_mpp_broadcast - diff --git a/test_fms/mpp/test_mpp_domains b/test_fms/mpp/test_mpp_domains new file mode 100755 index 0000000000..16f1ea80cf --- /dev/null +++ b/test_fms/mpp/test_mpp_domains @@ -0,0 +1,210 @@ +#! 
/bin/sh + +# test_mpp_domains - temporary wrapper script for .libs/test_mpp_domains +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_domains program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. 
These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_domains:test_mpp_domains:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_domains:test_mpp_domains:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_domains:test_mpp_domains:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' 
+ if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_domains' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_domains.F90 b/test_fms/mpp/test_mpp_domains.F90 index 546bae31c9..3067ff2fbe 100644 --- a/test_fms/mpp/test_mpp_domains.F90 +++ b/test_fms/mpp/test_mpp_domains.F90 @@ -1,5 +1,5 @@ !*********************************************************************** -!* GNU Lesser General Public License +!* Gnu Lesser General Public License !* !* This file is part of the GFDL Flexible Modeling System (FMS). 
!* @@ -24,6 +24,8 @@ program test_mpp_domains use mpp_mod, only : mpp_init, mpp_exit, mpp_chksum, stdout, stderr use mpp_mod, only : input_nml_file use mpp_mod, only : mpp_get_current_pelist, mpp_broadcast + use mpp_mod, only : mpp_init_test_requests_allocated + use mpp_domains_mod, only : GLOBAL_DATA_DOMAIN, BITWISE_EXACT_SUM, BGRID_NE, CGRID_NE, DGRID_NE, AGRID use mpp_domains_mod, only : FOLD_SOUTH_EDGE, FOLD_NORTH_EDGE, FOLD_WEST_EDGE, FOLD_EAST_EDGE use mpp_domains_mod, only : MPP_DOMAIN_TIME, CYCLIC_GLOBAL_DOMAIN, NUPDATE,EUPDATE, XUPDATE, YUPDATE, SCALAR_PAIR @@ -49,16 +51,16 @@ program test_mpp_domains use mpp_domains_mod, only : mpp_group_update_type, mpp_create_group_update use mpp_domains_mod, only : mpp_do_group_update, mpp_clear_group_update use mpp_domains_mod, only : mpp_start_group_update, mpp_complete_group_update - use mpp_domains_mod, only : WUPDATE, SUPDATE, mpp_get_compute_domains, NONSYMEDGEUPDATE - use mpp_domains_mod, only : domainUG, mpp_define_unstruct_domain, mpp_get_UG_domain_tile_id - use mpp_domains_mod, only : mpp_get_UG_compute_domain, mpp_pass_SG_to_UG, mpp_pass_UG_to_SG - use mpp_domains_mod, only : mpp_get_ug_global_domain, mpp_global_field_ug, mpp_get_tile_id + use mpp_domains_mod, only : WUPDATE, SUPDATE, mpp_get_compute_domains, NONSYMEDGEUPDATE, mpp_get_tile_id use mpp_memutils_mod, only : mpp_memuse_begin, mpp_memuse_end use fms_affinity_mod, only : fms_affinity_set + use mpp_io_mod, only: mpp_io_init + use compare_data_checksums + use test_domains_utility_mod + use platform_mod implicit none -#include "../../include/fms_platform.h" integer :: pe, npes integer :: nx=128, ny=128, nz=40, stackmax=4000000 integer :: unit=7 @@ -94,7 +96,6 @@ program test_mpp_domains logical :: mix_2D_3D = .false. logical :: test_subset = .false. - logical :: test_unstruct = .false. integer :: nthreads = 1 logical :: test_adjoint = .false. logical :: wide_halo = .false. 
@@ -124,16 +125,20 @@ program test_mpp_domains extra_halo, npes_nest_tile, cyclic_nest, mix_2D_3D, test_get_nbr, & test_edge_update, test_cubic_grid_redistribute, ensemble_size, & layout_cubic, layout_ensemble, nthreads, test_boundary, & - layout_tripolar, test_group, test_global_sum, test_subset, test_unstruct, & + layout_tripolar, test_group, test_global_sum, test_subset, & test_nonsym_edge, test_halosize_performance, test_adjoint, wide_halo integer :: i, j, k, n integer :: layout(2) integer :: id integer :: outunit, errunit, io_status integer :: omp_get_num_threads, omp_get_thread_num + integer :: ierr + - call mpp_memuse_begin() - call mpp_init() + call mpp_init(test_level=mpp_init_test_requests_allocated) + call mpp_domains_init(MPP_DEBUG) + call mpp_io_init() + call mpp_domains_set_stack_size(stackmax) outunit = stdout() errunit = stderr() @@ -146,6 +151,7 @@ program test_mpp_domains unit = unit + 1 if( unit.EQ.100 )call mpp_error( FATAL, 'Unable to locate unit number.' ) end do + open( unit=unit, file='input.nml', iostat=io_status ) read( unit,test_mpp_domains_nml, iostat=io_status ) close(unit) @@ -155,37 +161,13 @@ program test_mpp_domains call mpp_error(FATAL,'=>test_mpp_domains: Error reading input.nml') endif - select case(trim(warn_level)) - case("fatal") - call mpp_set_warn_level(FATAL) - case("warning") - call mpp_set_warn_level(WARNING) - case default - call mpp_error(FATAL, "test_mpp_domains: warn_level should be fatal or warning") - end select - pe = mpp_pe() npes = mpp_npes() - !--- initialize mpp domains - if( (.not.debug) .and. 
test_nest ) then - call mpp_domains_init() - elseif( debug )then - call mpp_domains_init(MPP_DEBUG) - else - call mpp_domains_init(MPP_DOMAIN_TIME) - end if - call mpp_domains_set_stack_size(stackmax) - -!$ call omp_set_num_threads(nthreads) -!$OMP PARALLEL -!$ call fms_affinity_set("test_mpp_domains", .FALSE., omp_get_num_threads()) -!$OMP END PARALLEL - - if( pe.EQ.mpp_root_pe() )print '(a,9i6)', 'npes, mpes, nx, ny, nz, whalo, ehalo, shalo, nhalo =', & - npes, mpes, nx, ny, nz, whalo, ehalo, shalo, nhalo - call mpp_memuse_end("in the begining", outunit) - + if( pe.EQ.mpp_root_pe() ) then + print '(a,9i6)', 'npes, mpes, nx, ny, nz, whalo, ehalo, shalo, nhalo =', & + npes, mpes, nx, ny, nz, whalo, ehalo, shalo, nhalo + endif !--- wide_halo_x and wide_halo_y must be either both 0 or both positive. if( wide_halo_x < 0 .OR. wide_halo_y < 0) call mpp_error(FATAL, & "test_mpp_domain: both wide_halo_x and wide_halo_y should be non-negative") @@ -210,16 +192,11 @@ program test_mpp_domains enddo if(ANY(refine_ratio(:).LT.1)) call mpp_error(FATAL, & "test_mpp_domain: check the setting of namelist variable refine_ratio") - call test_update_nest_domain('Cubic-Grid') + call test_update_nest_domain_r8('Cubic-Grid') + call test_update_nest_domain_r4('Cubic-Grid') if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Finished test_update_nest_domain <-------------------' endif - if(test_subset) then - if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_subset_update <-------------------' - call test_subset_update() - if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Finished test_subset_update <-------------------' - endif - if( test_halosize_performance ) then if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_halosize_performance <-------------------' call test_halosize_update( 'Folded-north' ) @@ -243,13 +220,6 @@ program test_mpp_domains if (mpp_pe() == mpp_root_pe()) print *, '--------------------> 
Finished test_nonsym_edge <-------------------' endif - if( test_performance) then - if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_performance <-------------------' - call update_domains_performance('Folded-north') - call update_domains_performance('Cubic-Grid') - if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Finished test_performance <-------------------' - endif - if( test_global_sum ) then if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_mpp_global_sum <-------------------' call test_mpp_global_sum('Folded-north') @@ -275,17 +245,9 @@ program test_mpp_domains if (test_adjoint) then if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_adjoint <-------------------' call test_get_boundary_ad('Four-Tile') - call test_halo_update_ad( 'Simple' ) - call test_global_reduce_ad( 'Simple') if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Finished test_adjoint <-------------------' endif - if( test_unstruct) then - if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_unstruct <-------------------' - call test_unstruct_update( 'Cubic-Grid' ) - if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_unstruct <-------------------' - endif - if( test_group) then if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_group <-------------------' call test_group_update( 'Folded-north' ) @@ -312,32 +274,6 @@ program test_mpp_domains call test_uniform_mosaic('Cubic-Grid') ! 6 tiles. call test_nonuniform_mosaic('Five-Tile') - call test_halo_update( 'Simple' ) !includes global field, global sum tests - call test_halo_update( 'Cyclic' ) - call test_halo_update( 'Folded-north' ) !includes vector field test -! call test_halo_update( 'Masked' ) !includes vector field test - call test_halo_update( 'Folded xy_halo' ) ! - if(.not. 
wide_halo) then - call test_halo_update( 'Simple symmetry' ) !includes global field, global sum tests - call test_halo_update( 'Cyclic symmetry' ) - endif - call test_halo_update( 'Folded-north symmetry' ) !includes vector field test - if(.not. wide_halo) then - call test_halo_update( 'Folded-south symmetry' ) !includes vector field test - call test_halo_update( 'Folded-west symmetry' ) !includes vector field test - call test_halo_update( 'Folded-east symmetry' ) !includes vector field test - endif - - !--- z1l: The following will not work due to symmetry and domain%x is cyclic. - !--- Will solve this problem in the future if needed. - ! call test_halo_update( 'Masked symmetry' ) !includes vector field test - - call test_global_field( 'Non-symmetry' ) - call test_global_field( 'Symmetry center' ) - call test_global_field( 'Symmetry corner' ) - call test_global_field( 'Symmetry east' ) - call test_global_field( 'Symmetry north' ) - if(.not. wide_halo) then call test_global_reduce( 'Simple') call test_global_reduce( 'Simple symmetry center') @@ -383,8 +319,7 @@ program test_mpp_domains if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Finish test_get_nbr <-------------------' endif - call mpp_domains_exit() - call mpp_exit() + call MPI_finalize(ierr) contains subroutine test_openmp() @@ -690,6 +625,8 @@ subroutine test_redistribute( type ) if(ALLOCATED(y))deallocate(y,y2,y3,y4,y5,y6) end subroutine test_redistribute + !####################################################################### + subroutine cubic_grid_redistribute integer :: npes, npes_per_ensemble, npes_per_tile @@ -858,7 +795,7 @@ subroutine cubic_grid_redistribute end subroutine cubic_grid_redistribute - + !################################################### subroutine test_uniform_mosaic( type ) character(len=*), intent(in) :: type @@ -1021,7 +958,6 @@ subroutine test_uniform_mosaic( type ) allocate(istart1(num_contact), iend1(num_contact), jstart1(num_contact), jend1(num_contact) ) 
allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) ) - call mpp_memuse_begin() !--- define domain if(single_tile) then !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) @@ -1102,7 +1038,6 @@ subroutine test_uniform_mosaic( type ) call define_cubic_mosaic(type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & global_indices, layout2D, pe_start, pe_end ) endif - call mpp_memuse_end(trim(type)//" mpp_define_mosaic", outunit ) !--- setup data allocate(global2(1-whalo:nx+ehalo,1-shalo:ny+nhalo,nz, ntile_per_pe) ) @@ -1735,109 +1670,87 @@ subroutine test_uniform_mosaic( type ) end subroutine test_uniform_mosaic - !################################################################################# - subroutine update_domains_performance( type ) + !############################################################### + subroutine test_mpp_global_sum( type ) character(len=*), intent(in) :: type type(domain2D) :: domain - integer :: num_contact, ntiles, npes_per_tile, ntile_per_pe + integer :: num_contact, ntiles, npes_per_tile integer :: i, j, k, l, n, shift - integer :: ism, iem, jsm, jem integer :: isc, iec, jsc, jec, isd, ied, jsd, jed + integer :: ism, iem, jsm, jem - integer, allocatable, dimension(:) :: tile integer, allocatable, dimension(:) :: pe_start, pe_end, tile1, tile2 integer, allocatable, dimension(:) :: istart1, iend1, jstart1, jend1 integer, allocatable, dimension(:) :: istart2, iend2, jstart2, jend2 integer, allocatable, dimension(:,:) :: layout2D, global_indices - real, allocatable, dimension(:,:,:,:) :: x, x1, y, y1, x_save, y_save - real, allocatable, dimension(:,:,:,:) :: a, a1, b, b1 - real, allocatable, dimension(:,:,: ) :: a1_2D, b1_2D - integer :: id_update - integer :: id1, id2 - logical :: folded_north - logical :: cubic_grid, single_tile, four_tile + real, allocatable, dimension(:,:,:) :: data_3D + real, allocatable, dimension(:,:) :: data_2D + + integer(kind=8) :: mold + logical :: 
folded_north, cubic_grid character(len=3) :: text integer :: nx_save, ny_save - integer :: id_single, id_update_single + integer :: id1, id2, id3, id4 + real :: gsum1, gsum2, gsum3, gsum4 folded_north = .false. cubic_grid = .false. - single_tile = .false. - four_tile = .false. + nx_save = nx ny_save = ny !--- check the type select case(type) - case ( 'Single-Tile' ) !--- single with cyclic along x- and y-direction - single_tile = .true. - ntiles = 1 - num_contact = 2 case ( 'Folded-north' ) ntiles = 1 + shift = 0 num_contact = 2 folded_north = .true. - case ( 'Four-Tile' ) !--- cyclic along both x- and y-direction. - ntiles = 4 - num_contact = 8 - four_tile = .true. + npes_per_tile = npes + if(layout_tripolar(1)*layout_tripolar(2) == npes ) then + layout = layout_tripolar + else + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + endif case ( 'Cubic-Grid' ) if( nx_cubic == 0 ) then - call mpp_error(NOTE,'update_domains_performance: for Cubic_grid mosaic, nx_cubic is zero, '//& - 'No test is done for Cubic-Grid mosaic. ' ) + call mpp_error(NOTE,'test_group_update: for Cubic_grid mosaic, nx_cubic is zero, '//& + 'No test is done for Cubic-Grid mosaic. ' ) return endif if( nx_cubic .NE. ny_cubic ) then - call mpp_error(NOTE,'update_domains_performance: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& - 'No test is done for Cubic-Grid mosaic. ' ) + call mpp_error(NOTE,'test_group_update: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& + 'No test is done for Cubic-Grid mosaic. ' ) return endif - + shift = 1 nx = nx_cubic ny = ny_cubic ntiles = 6 num_contact = 12 cubic_grid = .true. - + if( mod(npes, ntiles) == 0 ) then + npes_per_tile = npes/ntiles + write(outunit,*)'NOTE from test_mpp_global_sum ==> For Mosaic "', trim(type), & + '", each tile will be distributed over ', npes_per_tile, ' processors.' 
+ else + call mpp_error(NOTE,'test_group_update: npes should be multiple of ntiles No test is done for '//trim(type)) + return + endif + if(layout_cubic(1)*layout_cubic(2) == npes_per_tile) then + layout = layout_cubic + else + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + endif case default - call mpp_error(FATAL, 'update_domains_performance: no such test: '//type) + call mpp_error(FATAL, 'test_mpp_global_sum: no such test: '//type) end select allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) - if( mod(npes, ntiles) == 0 ) then - npes_per_tile = npes/ntiles - write(outunit,*)'NOTE from update_domains_performance ==> For Mosaic "', trim(type), & - '", each tile will be distributed over ', npes_per_tile, ' processors.' - ntile_per_pe = 1 - allocate(tile(ntile_per_pe)) - tile = pe/npes_per_tile+1 - if(cubic_grid) then - call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) - else - call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) - endif - do n = 1, ntiles - pe_start(n) = (n-1)*npes_per_tile - pe_end(n) = n*npes_per_tile-1 - end do - else if ( mod(ntiles, npes) == 0 ) then - ntile_per_pe = ntiles/npes - write(outunit,*)'NOTE from update_domains_performance ==> For Mosaic "', trim(type), & - '", there will be ', ntile_per_pe, ' tiles on each processor.' - allocate(tile(ntile_per_pe)) - do n = 1, ntile_per_pe - tile(n) = pe*ntile_per_pe + n - end do - do n = 1, ntiles - pe_start(n) = (n-1)/ntile_per_pe - pe_end(n) = pe_start(n) - end do - layout = 1 - else - call mpp_error(NOTE,'update_domains_performance: npes should be multiple of ntiles or ' // & - 'ntiles should be multiple of npes. 
No test is done for '//trim(type) ) - return - end if + do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do do n = 1, ntiles global_indices(:,n) = (/1,nx,1,ny/) @@ -1849,20 +1762,7 @@ subroutine update_domains_performance( type ) allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) ) !--- define domain - if(single_tile) then - !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) - tile1(1) = 1; tile2(1) = 1 - istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny - istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny - !--- Contact line 2, between tile 1 (SOUTH) and tile 1 (NORTH) --- cyclic - tile1(2) = 1; tile2(2) = 1 - istart1(2) = 1; iend1(2) = nx; jstart1(2) = 1; jend1(2) = 1 - istart2(2) = 1; iend2(2) = nx; jstart2(2) = ny; jend2(2) = ny - call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & - istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & - pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & - name = type, symmetry = .false. ) - else if(folded_north) then + if(folded_north) then !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) --- cyclic tile1(1) = 1; tile2(1) = 1 istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny @@ -1872,463 +1772,126 @@ subroutine update_domains_performance( type ) istart1(2) = 1; iend1(2) = nx/2; jstart1(2) = ny; jend1(2) = ny istart2(2) = nx; iend2(2) = nx/2+1; jstart2(2) = ny; jend2(2) = ny call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & - istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & - pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & - name = type, symmetry = .false. ) - else if( four_tile ) then - call define_fourtile_mosaic(type, domain, (/nx,nx,nx,nx/), (/ny,ny,ny,ny/), global_indices, & - layout2D, pe_start, pe_end, symmetry = .false. 
) + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name = type, symmetry = .false. ) else if( cubic_grid ) then call define_cubic_mosaic(type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & - global_indices, layout2D, pe_start, pe_end ) + global_indices, layout2D, pe_start, pe_end ) endif !--- setup data call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) - call mpp_get_memory_domain ( domain, ism, iem, jsm, jem ) - allocate( x (ism:iem,jsm:jem,nz, ntile_per_pe) ) - allocate( x_save (ism:iem,jsm:jem,nz, ntile_per_pe) ) - allocate( a (ism:iem,jsm:jem,nz, ntile_per_pe) ) - x = 0 - do l = 1, ntile_per_pe - do k = 1, nz - do j = jsc, jec - do i = isc, iec - x(i, j, k, l) = tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 - enddo + + allocate(data_2d(isd:ied,jsd:jed)) + allocate(data_3d(isd:ied,jsd:jed,nz)) + + do k = 1, nz + do j = jsd, jed + do i = isd, ied + data_3d(i,j,k) = k*1e3 + i + j*1e-3 enddo enddo enddo - a = x - x_save = x - - if(num_fields<1) then - call mpp_error(FATAL, "test_mpp_domains: num_fields must be a positive integer") - endif - - id1 = mpp_clock_id( type, flags=MPP_CLOCK_SYNC) - id_single = mpp_clock_id( type//' non-blocking', flags=MPP_CLOCK_SYNC) + do j = jsd, jed + do i = isd, ied + data_2d(i,j) = i*1e3 + j*1e-3 + enddo + enddo + id1 = mpp_clock_id( type//' bitwise sum 3D', flags=MPP_CLOCK_SYNC ) + id2 = mpp_clock_id( type//' EFP sum 3D', flags=MPP_CLOCK_SYNC ) + id3 = mpp_clock_id( type//' EFP sum 3D check', flags=MPP_CLOCK_SYNC ) + id4 = mpp_clock_id( type//' non-bitwise sum 3D', flags=MPP_CLOCK_SYNC ) call mpp_clock_begin(id1) - call mpp_update_domains( x, domain) - call mpp_clock_end (id1) + do n = 1, num_iter + gsum1 = mpp_global_sum(domain, data_3d, flags=BITWISE_EXACT_SUM) + enddo + call mpp_clock_end(id1) - call mpp_clock_begin(id_single) - id_update_single = 
mpp_start_update_domains(a, domain) - call mpp_clock_end (id_single) + call mpp_clock_begin(id2) + do n = 1, num_iter + gsum2 = mpp_global_sum(domain, data_3d, flags=BITWISE_EFP_SUM) + enddo + call mpp_clock_end(id2) - !---- sleep some time for non-blocking. - if(do_sleep) call sleep(1) + call mpp_clock_begin(id3) + do n = 1, num_iter + gsum3 = mpp_global_sum(domain, data_3d, flags=BITWISE_EFP_SUM, overflow_check=.true. ) + enddo + call mpp_clock_end(id3) - id1 = mpp_clock_id( type//' group', flags=MPP_CLOCK_SYNC ) - id2 = mpp_clock_id( type//' group non-blocking', flags=MPP_CLOCK_SYNC ) + call mpp_clock_begin(id4) + do n = 1, num_iter + gsum4= mpp_global_sum(domain, data_3d) + enddo + call mpp_clock_end(id4) + write(outunit, *) " ********************************************************************************" + write(outunit, *) " global sum for "//type//' bitwise exact sum 3D = ', gsum1 + write(outunit, *) " global sum for "//type//' bitwise EFP sum 3D = ', gsum2 + write(outunit, *) " global sum for "//type//' bitwise EFP sum 3D with overflow_check = ', gsum3 + write(outunit, *) " global sum for "//type//' non-bitwise sum 3D = ', gsum4 + write(outunit, *) " " + write(outunit, *) " chksum for "//type//' bitwise exact sum 3D = ', transfer(gsum1, mold) + write(outunit, *) " chksum for "//type//' bitwise EFP sum 3D = ', transfer(gsum2, mold) + write(outunit, *) " chksum for "//type//' bitwise EFP sum 3D with overflow_check = ', transfer(gsum3, mold) + write(outunit, *) " chksum for "//type//' non-bitwise sum 3D = ', transfer(gsum4, mold) + write(outunit, *) " ********************************************************************************" - if(ntile_per_pe == 1) then - allocate( x1(ism:iem,jsm:jem,nz, num_fields) ) - allocate( a1(ism:iem,jsm:jem,nz, num_fields) ) - if(mix_2D_3D) allocate( a1_2D(ism:iem,jsm:jem,num_fields) ) + id1 = mpp_clock_id( type//' bitwise sum 2D', flags=MPP_CLOCK_SYNC ) + id2 = mpp_clock_id( type//' EFP sum 2D', flags=MPP_CLOCK_SYNC ) + id3 = 
mpp_clock_id( type//' EFP sum 2D check', flags=MPP_CLOCK_SYNC ) + id4 = mpp_clock_id( type//' non-bitwise sum 2D', flags=MPP_CLOCK_SYNC ) - do n = 1, num_iter - do l = 1, num_fields - x1(:,:,:,l) = x_save(:,:,:,1) - a1(:,:,:,l) = x_save(:,:,:,1) - if(mix_2D_3D) a1_2D(:,:,l) = x_save(:,:,1,1) - enddo + call mpp_clock_begin(id1) + do n = 1, num_iter + gsum1 = mpp_global_sum(domain, data_2d, flags=BITWISE_EXACT_SUM) + enddo + call mpp_clock_end(id1) - call mpp_clock_begin(id1) - do l = 1, num_fields - call mpp_update_domains( x1(:,:,:,l), domain, complete=l==num_fields ) - enddo - call mpp_clock_end (id1) - - ! non-blocking update - call mpp_clock_begin(id2) - if( n == 1 ) then - do l = 1, num_fields - if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, complete=.false.) - id_update = mpp_start_update_domains(a1(:,:,:,l), domain, complete=l==num_fields) - enddo - else - do l = 1, num_fields - if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, update_id=id_update, complete=.false.) - id_update = mpp_start_update_domains(a1(:,:,:,l), domain, update_id=id_update, complete=l==num_fields) - enddo - endif - call mpp_clock_end (id2) + call mpp_clock_begin(id2) + do n = 1, num_iter + gsum2 = mpp_global_sum(domain, data_2d, flags=BITWISE_EFP_SUM) + enddo + call mpp_clock_end(id2) - !---- sleep some time for non-blocking. - if(do_sleep) call sleep(1) + call mpp_clock_begin(id3) + do n = 1, num_iter + gsum3 = mpp_global_sum(domain, data_2d, flags=BITWISE_EFP_SUM, overflow_check=.true. ) + enddo + call mpp_clock_end(id3) - call mpp_clock_begin(id2) - do l = 1, num_fields - if(mix_2D_3D) call mpp_complete_update_domains(id_update, a1_2D(:,:,l), domain, complete=.false.) 
- call mpp_complete_update_domains(id_update, a1(:,:,:,l), domain, complete=l==num_fields) - enddo - call mpp_clock_end (id2) + call mpp_clock_begin(id4) + do n = 1, num_iter + gsum4= mpp_global_sum(domain, data_2d) + enddo + call mpp_clock_end(id4) + write(outunit, *) " ********************************************************************************" + write(outunit, *) " global sum for "//type//' bitwise exact sum 2D = ', gsum1 + write(outunit, *) " global sum for "//type//' bitwise EFP sum 2D = ', gsum2 + write(outunit, *) " global sum for "//type//' bitwise EFP sum 2D with overflow_check = ', gsum3 + write(outunit, *) " global sum for "//type//' non-bitwise sum 2D = ', gsum4 + write(outunit, *) " " + write(outunit, *) " chksum for "//type//' bitwise exact sum 2D = ', transfer(gsum1, mold) + write(outunit, *) " chksum for "//type//' bitwise EFP sum 2D = ', transfer(gsum2, mold) + write(outunit, *) " chksum for "//type//' bitwise EFP sum 2D with overflow_check = ', transfer(gsum3, mold) + write(outunit, *) " chksum for "//type//' non-bitwise sum 2D = ', transfer(gsum4, mold) + write(outunit, *) " ********************************************************************************" - !--- compare checksum - do l = 1, num_fields - write(text, '(i3.3)') l - call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), type//' X'//text) - enddo - if(mix_2D_3D)call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), type//' X 2D') - enddo - deallocate(x1, a1) - if(mix_2D_3D) deallocate(a1_2D) - endif - call mpp_clock_begin(id_single) - call mpp_complete_update_domains(id_update_single, a, domain) - call mpp_clock_end (id_single) - call compare_checksums( x(:,:,:,1), a(:,:,:,1), type) - deallocate(x, a, x_save) + nx = nx_save + ny = ny_save - !------------------------------------------------------------------ - ! 
vector update : BGRID_NE, one extra point in each direction for cubic-grid - !------------------------------------------------------------------ - !--- setup data - shift = 0 - if(single_tile .or. four_tile .or. folded_north) then - shift = 0 - else - shift = 1 - endif - - allocate( x (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) - allocate( y (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) - allocate( x_save (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) - allocate( y_save (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) - allocate( a (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) - allocate( b (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) - x = 0 - y = 0 - do l = 1, ntile_per_pe - do k = 1, nz - do j = jsc, jec+shift - do i = isc, iec+shift - x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 - y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 - end do - end do - end do - enddo - a = x; b = y - x_save = x; y_save = y - - id1 = mpp_clock_id( trim(type)//' BGRID', flags=MPP_CLOCK_SYNC ) - id_single = mpp_clock_id( trim(type)//' BGRID non-blocking', flags=MPP_CLOCK_SYNC ) - - call mpp_clock_begin(id1) - call mpp_update_domains( x, y, domain, gridtype=BGRID_NE) - call mpp_clock_end (id1) - - !--- non-blocking update - call mpp_clock_begin(id_single) - id_update_single = mpp_start_update_domains(a, b, domain, gridtype=BGRID_NE) - call mpp_clock_end (id_single) - - !---- sleep some time for non-blocking. 
- if(do_sleep) call sleep(1) - - id1 = mpp_clock_id( trim(type)//' BGRID group', flags=MPP_CLOCK_SYNC) - id2 = mpp_clock_id( trim(type)//' BGRID group non-blocking', flags=MPP_CLOCK_SYNC) - if(ntile_per_pe == 1) then - allocate( x1(ism:iem+shift,jsm:jem+shift,nz,num_fields) ) - allocate( y1(ism:iem+shift,jsm:jem+shift,nz,num_fields) ) - allocate( a1(ism:iem+shift,jsm:jem+shift,nz,num_fields) ) - allocate( b1(ism:iem+shift,jsm:jem+shift,nz,num_fields) ) - if(mix_2D_3D) then - allocate( a1_2D(ism:iem+shift,jsm:jem+shift,num_fields) ) - allocate( b1_2D(ism:iem+shift,jsm:jem+shift,num_fields) ) - endif - - do n = 1, num_iter - do l = 1, num_fields - x1(:,:,:,l) = x_save(:,:,:,1) - a1(:,:,:,l) = x_save(:,:,:,1) - y1(:,:,:,l) = y_save(:,:,:,1) - b1(:,:,:,l) = y_save(:,:,:,1) - if(mix_2D_3D) then - a1_2D(:,:,l) = x_save(:,:,1,1) - b1_2D(:,:,l) = y_save(:,:,1,1) - endif - enddo - - call mpp_clock_begin(id1) - do l = 1, num_fields - call mpp_update_domains( x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=BGRID_NE, complete=l==num_fields ) - enddo - call mpp_clock_end (id1) - - !--- non-blocking update - call mpp_clock_begin(id2) - if( n == 1 ) then - do l = 1, num_fields - if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, & - gridtype=BGRID_NE, complete=.false.) - id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, & - gridtype=BGRID_NE, complete=l==num_fields) - enddo - else - do l = 1, num_fields - if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, gridtype=BGRID_NE, & - update_id=id_update, complete=.false.) - id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, gridtype=BGRID_NE, & - update_id=id_update, complete=l==num_fields) - enddo - endif - call mpp_clock_end (id2) - - !---- sleep some time for non-blocking. 
- if(do_sleep) call sleep(1) - - call mpp_clock_begin(id2) - do l = 1, num_fields - if(mix_2D_3D)call mpp_complete_update_domains(id_update, a1_2D(:,:,l), b1_2D(:,:,l), domain, & - gridtype=BGRID_NE, complete=.false.) - call mpp_complete_update_domains(id_update, a1(:,:,:,l), b1(:,:,:,l), domain, & - gridtype=BGRID_NE, complete=l==num_fields) - enddo - call mpp_clock_end (id2) - - !--- compare checksum - do l = 1, num_fields - write(text, '(i3.3)') l - call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), type//' BGRID X'//text) - call compare_checksums( y1(:,:,:,l), b1(:,:,:,l), type//' BGRID Y'//text) - if(mix_2D_3D) then - call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), type//' BGRID X'//text) - call compare_checksums( y1(:,:,:,1), b1(:,:,:,1), type//' BGRID Y'//text) - endif - enddo - if(mix_2D_3D) then - call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), type//' BGRID X 2D') - call compare_checksums( y1(:,:,1,:), b1_2D(:,:,:), type//' BGRID Y 2D') - endif - enddo - deallocate(x1, y1, a1, b1) - if(mix_2D_3D) deallocate(a1_2D, b1_2D) - endif - - call mpp_clock_begin(id_single) - call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=BGRID_NE) - call mpp_clock_end (id_single) - - - !--- compare checksum - - call compare_checksums( x(:,:,:,1), a(:,:,:,1), type//' BGRID X') - call compare_checksums( y(:,:,:,1), b(:,:,:,1), type//' BGRID Y') - - - deallocate(x, y, a, b, x_save, y_save) - !------------------------------------------------------------------ - ! 
vector update : CGRID_NE, one extra point in each direction for cubic-grid - !------------------------------------------------------------------ - allocate( x (ism:iem+shift,jsm:jem ,nz,ntile_per_pe) ) - allocate( y (ism:iem ,jsm:jem+shift,nz,ntile_per_pe) ) - allocate( a (ism:iem+shift,jsm:jem ,nz,ntile_per_pe) ) - allocate( b (ism:iem ,jsm:jem+shift,nz,ntile_per_pe) ) - allocate( x_save (ism:iem+shift,jsm:jem ,nz,ntile_per_pe) ) - allocate( y_save (ism:iem ,jsm:jem+shift,nz,ntile_per_pe) ) - - - x = 0 - y = 0 - do l = 1, ntile_per_pe - do k = 1, nz - do j = jsc, jec - do i = isc, iec+shift - x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 - end do - end do - do j = jsc, jec+shift - do i = isc, iec - y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 - end do - end do - end do - enddo - - a = x; b = y - x_save = x; y_save = y - - id1 = mpp_clock_id( trim(type)//' CGRID', flags=MPP_CLOCK_SYNC ) - id_single = mpp_clock_id( trim(type)//' CGRID non-blocking', flags=MPP_CLOCK_SYNC ) - - call mpp_clock_begin(id1) - call mpp_update_domains( x, y, domain, gridtype=CGRID_NE) - call mpp_clock_end (id1) - - !--- non-blocking update - call mpp_clock_begin(id_single) - id_update_single = mpp_start_update_domains(a, b, domain, gridtype=CGRID_NE) - call mpp_clock_end (id_single) - - !---- sleep some time for non-blocking. 
- if(do_sleep) call sleep(1) - - id1 = mpp_clock_id( trim(type)//' CGRID group', flags=MPP_CLOCK_SYNC ) - id2 = mpp_clock_id( trim(type)//' CGRID group non-blocking', flags=MPP_CLOCK_SYNC ) - - if(ntile_per_pe == 1) then - allocate( x1(ism:iem+shift,jsm:jem ,nz,num_fields) ) - allocate( y1(ism:iem ,jsm:jem+shift,nz,num_fields) ) - allocate( a1(ism:iem+shift,jsm:jem ,nz,num_fields) ) - allocate( b1(ism:iem ,jsm:jem+shift,nz,num_fields) ) - if(mix_2D_3D) then - allocate( a1_2D(ism:iem+shift,jsm:jem ,num_fields) ) - allocate( b1_2D(ism:iem ,jsm:jem+shift,num_fields) ) - endif - - do n = 1, num_iter - do l = 1, num_fields - x1(:,:,:,l) = x_save(:,:,:,1) - a1(:,:,:,l) = x_save(:,:,:,1) - y1(:,:,:,l) = y_save(:,:,:,1) - b1(:,:,:,l) = y_save(:,:,:,1) - if(mix_2D_3D) then - a1_2D(:,:,l) = x_save(:,:,1,1) - b1_2D(:,:,l) = y_save(:,:,1,1) - endif - enddo - - call mpp_clock_begin(id1) - do l = 1, num_fields - call mpp_update_domains( x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=CGRID_NE, complete=l==num_fields ) - enddo - call mpp_clock_end (id1) - - !--- non-blocking update - call mpp_clock_begin(id2) - if( n == 1 ) then - do l = 1, num_fields - if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, & - gridtype=CGRID_NE, complete=.false.) - id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, & - gridtype=CGRID_NE, complete=l==num_fields) - enddo - else - do l = 1, num_fields - if(mix_2D_3D)id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, gridtype=CGRID_NE, & - update_id=id_update, complete=.false.) - id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, gridtype=CGRID_NE, & - update_id=id_update, complete=l==num_fields) - enddo - endif - call mpp_clock_end (id2) - - !---- sleep some time for non-blocking. 
- if(do_sleep) call sleep(1) - - call mpp_clock_begin(id2) - do l = 1, num_fields - if(mix_2D_3D)call mpp_complete_update_domains(id_update, a1_2D(:,:,l), b1_2D(:,:,l), domain, & - gridtype=CGRID_NE, complete=.false.) - call mpp_complete_update_domains(id_update, a1(:,:,:,l), b1(:,:,:,l), domain, & - gridtype=CGRID_NE, complete=l==num_fields) - enddo - call mpp_clock_end (id2) - - !--- compare checksum - do l = 1, num_fields - write(text, '(i3.3)') l - call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), type//' CGRID X'//text) - call compare_checksums( y1(:,:,:,l), b1(:,:,:,l), type//' CGRID Y'//text) - enddo - if(mix_2D_3D) then - call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), type//' BGRID X 2D') - call compare_checksums( y1(:,:,1,:), b1_2D(:,:,:), type//' BGRID Y 2D') - endif - enddo - deallocate(x1, y1, a1, b1) - if(mix_2D_3D) deallocate(a1_2D, b1_2D) - endif - - call mpp_clock_begin(id_single) - call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=CGRID_NE) - call mpp_clock_end (id_single) - - !--- compare checksum - - call compare_checksums( x(:,:,:,1), a(:,:,:,1), type//' CGRID X') - call compare_checksums( y(:,:,:,1), b(:,:,:,1), type//' CGRID Y') - - deallocate(x, y, a, b, x_save, y_save) - - - !------------------------------------------------------------------ - ! 
vector update : AGRID vector and scalar pair - !------------------------------------------------------------------ - allocate( x (ism:iem,jsm:jem,nz,ntile_per_pe) ) - allocate( y (ism:iem,jsm:jem,nz,ntile_per_pe) ) - allocate( a (ism:iem,jsm:jem,nz,ntile_per_pe) ) - allocate( b (ism:iem,jsm:jem,nz,ntile_per_pe) ) - allocate( x_save (ism:iem,jsm:jem,nz,ntile_per_pe) ) - allocate( y_save (ism:iem,jsm:jem,nz,ntile_per_pe) ) - - - x = 0 - y = 0 - do l = 1, ntile_per_pe - do k = 1, nz - do j = jsc, jec - do i = isc, iec+shift - x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 - end do - end do - do j = jsc, jec+shift - do i = isc, iec - y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 - end do - end do - end do - enddo - - a = x; b = y - x_save = x; y_save = y - - call mpp_update_domains( x, y, domain, gridtype=AGRID) - - id_update_single = mpp_start_update_domains(a, b, domain, gridtype=AGRID) - call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=AGRID) - - !--- compare checksum - call compare_checksums( x(:,:,:,1), a(:,:,:,1), type//' AGRID X') - call compare_checksums( y(:,:,:,1), b(:,:,:,1), type//' AGRID Y') - - x = x_save; y = y_save - a = x_save; b = y_save - - call mpp_update_domains( x, y, domain, gridtype=AGRID, flags = SCALAR_PAIR) - - id_update_single = mpp_start_update_domains(a, b, domain, gridtype=AGRID, flags = SCALAR_PAIR) - call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=AGRID, flags = SCALAR_PAIR) - - !--- compare checksum - call compare_checksums( x(:,:,:,1), a(:,:,:,1), type//' AGRID SCALAR-PAIR X') - call compare_checksums( y(:,:,:,1), b(:,:,:,1), type//' AGRID SCALAR-PAIR Y') - - deallocate(x, y, a, b, x_save, y_save) - - nx = nx_save - ny = ny_save - - deallocate(layout2D, global_indices, pe_start, pe_end, tile1, tile2) - deallocate(istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2 ) - - - end subroutine update_domains_performance - + end subroutine 
test_mpp_global_sum !############################################################### - subroutine test_mpp_global_sum( type ) + subroutine test_group_update( type ) character(len=*), intent(in) :: type type(domain2D) :: domain @@ -2341,15 +1904,16 @@ subroutine test_mpp_global_sum( type ) integer, allocatable, dimension(:) :: istart1, iend1, jstart1, jend1 integer, allocatable, dimension(:) :: istart2, iend2, jstart2, jend2 integer, allocatable, dimension(:,:) :: layout2D, global_indices - real, allocatable, dimension(:,:,:) :: data_3D - real, allocatable, dimension(:,:) :: data_2D - - integer(kind=8) :: mold - logical :: folded_north, cubic_grid + real, allocatable, dimension(:,:,:,:) :: x1, y1, x2, y2 + real, allocatable, dimension(:,:,:,:) :: a1, a2 + real, allocatable, dimension(:,:,:) :: base + integer :: id1, id2, id3 + logical :: folded_north + logical :: cubic_grid character(len=3) :: text integer :: nx_save, ny_save - integer :: id1, id2, id3, id4 - real :: gsum1, gsum2, gsum3, gsum4 + type(mpp_group_update_type) :: group_update + type(mpp_group_update_type), allocatable :: update_list(:) folded_north = .false. cubic_grid = .false. @@ -2388,7 +1952,7 @@ subroutine test_mpp_global_sum( type ) cubic_grid = .true. if( mod(npes, ntiles) == 0 ) then npes_per_tile = npes/ntiles - write(outunit,*)'NOTE from test_mpp_global_sum ==> For Mosaic "', trim(type), & + write(outunit,*)'NOTE from update_domains_performance ==> For Mosaic "', trim(type), & '", each tile will be distributed over ', npes_per_tile, ' processors.' 
else call mpp_error(NOTE,'test_group_update: npes should be multiple of ntiles No test is done for '//trim(type)) @@ -2400,7 +1964,7 @@ subroutine test_mpp_global_sum( type ) call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) endif case default - call mpp_error(FATAL, 'test_mpp_global_sum: no such test: '//type) + call mpp_error(FATAL, 'test_group_update: no such test: '//type) end select allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) @@ -2440,296 +2004,75 @@ subroutine test_mpp_global_sum( type ) !--- setup data call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + call mpp_get_memory_domain ( domain, ism, iem, jsm, jem ) - allocate(data_2d(isd:ied,jsd:jed)) - allocate(data_3d(isd:ied,jsd:jed,nz)) + if(num_fields<1) then + call mpp_error(FATAL, "test_mpp_domains: num_fields must be a positive integer") + endif - do k = 1, nz - do j = jsd, jed - do i = isd, ied - data_3d(i,j,k) = k*1e3 + i + j*1e-3 - enddo - enddo - enddo + allocate(update_list(num_fields)) - do j = jsd, jed - do i = isd, ied - data_2d(i,j) = i*1e3 + j*1e-3 + id1 = mpp_clock_id( type//' group 2D', flags=MPP_CLOCK_SYNC ) + id2 = mpp_clock_id( type//' non-group 2D', flags=MPP_CLOCK_SYNC ) + id3 = mpp_clock_id( type//' non-block group 2D', flags=MPP_CLOCK_SYNC ) + + allocate( a1(ism:iem, jsm:jem, nz, num_fields) ) + allocate( x1(ism:iem+shift,jsm:jem, nz, num_fields) ) + allocate( y1(ism:iem, jsm:jem+shift, nz, num_fields) ) + allocate( a2(ism:iem, jsm:jem, nz, num_fields) ) + allocate( x2(ism:iem+shift,jsm:jem, nz, num_fields) ) + allocate( y2(ism:iem, jsm:jem+shift, nz, num_fields) ) + allocate( base(isc:iec+shift,jsc:jec+shift,nz) ) + a1 = 0; x1 = 0; y1 = 0 + + base = 0 + do k = 1,nz + do j = jsc, jec+shift + do i = isc, iec+shift + base(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + !--- Test for partial direction update + do l =1, num_fields + call 
mpp_create_group_update(group_update, a1(:,:,:,l), domain, flags=WUPDATE+SUPDATE) + end do + + do l = 1, num_fields + a1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + do k=1,nz + do i=isc-1,iec+1 + a1(i,jsc-1,k,l) = 999; + a1(i,jec+1,k,l) = 999; + enddo + do j=jsc,jec + a1(isc-1,j,k,l) = 999 + a1(iec+1,j,k,l) = 999 + enddo enddo enddo - id1 = mpp_clock_id( type//' bitwise sum 3D', flags=MPP_CLOCK_SYNC ) - id2 = mpp_clock_id( type//' EFP sum 3D', flags=MPP_CLOCK_SYNC ) - id3 = mpp_clock_id( type//' EFP sum 3D check', flags=MPP_CLOCK_SYNC ) - id4 = mpp_clock_id( type//' non-bitwise sum 3D', flags=MPP_CLOCK_SYNC ) + a2 = a1 + call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1)) - call mpp_clock_begin(id1) - do n = 1, num_iter - gsum1 = mpp_global_sum(domain, data_3d, flags=BITWISE_EXACT_SUM) + do l = 1, num_fields + call mpp_update_domains( a2(:,:,:,l), domain, flags=WUPDATE+SUPDATE, complete=l==num_fields ) enddo - call mpp_clock_end(id1) - call mpp_clock_begin(id2) - do n = 1, num_iter - gsum2 = mpp_global_sum(domain, data_3d, flags=BITWISE_EFP_SUM) + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied,jsd:jed,:,l),a2(isd:ied,jsd:jed,:,l),type//' CENTER South West '//text) enddo - call mpp_clock_end(id2) - call mpp_clock_begin(id3) - do n = 1, num_iter - gsum3 = mpp_global_sum(domain, data_3d, flags=BITWISE_EFP_SUM, overflow_check=.true. 
) - enddo - call mpp_clock_end(id3) + call mpp_clear_group_update(group_update) - call mpp_clock_begin(id4) - do n = 1, num_iter - gsum4= mpp_global_sum(domain, data_3d) - enddo - call mpp_clock_end(id4) - - write(outunit, *) " ********************************************************************************" - write(outunit, *) " global sum for "//type//' bitwise exact sum 3D = ', gsum1 - write(outunit, *) " global sum for "//type//' bitwise EFP sum 3D = ', gsum2 - write(outunit, *) " global sum for "//type//' bitwise EFP sum 3D with overflow_check = ', gsum3 - write(outunit, *) " global sum for "//type//' non-bitwise sum 3D = ', gsum4 - write(outunit, *) " " - write(outunit, *) " chksum for "//type//' bitwise exact sum 3D = ', transfer(gsum1, mold) - write(outunit, *) " chksum for "//type//' bitwise EFP sum 3D = ', transfer(gsum2, mold) - write(outunit, *) " chksum for "//type//' bitwise EFP sum 3D with overflow_check = ', transfer(gsum3, mold) - write(outunit, *) " chksum for "//type//' non-bitwise sum 3D = ', transfer(gsum4, mold) - write(outunit, *) " ********************************************************************************" - - id1 = mpp_clock_id( type//' bitwise sum 2D', flags=MPP_CLOCK_SYNC ) - id2 = mpp_clock_id( type//' EFP sum 2D', flags=MPP_CLOCK_SYNC ) - id3 = mpp_clock_id( type//' EFP sum 2D check', flags=MPP_CLOCK_SYNC ) - id4 = mpp_clock_id( type//' non-bitwise sum 2D', flags=MPP_CLOCK_SYNC ) - - call mpp_clock_begin(id1) - do n = 1, num_iter - gsum1 = mpp_global_sum(domain, data_2d, flags=BITWISE_EXACT_SUM) - enddo - call mpp_clock_end(id1) - - call mpp_clock_begin(id2) - do n = 1, num_iter - gsum2 = mpp_global_sum(domain, data_2d, flags=BITWISE_EFP_SUM) - enddo - call mpp_clock_end(id2) - - call mpp_clock_begin(id3) - do n = 1, num_iter - gsum3 = mpp_global_sum(domain, data_2d, flags=BITWISE_EFP_SUM, overflow_check=.true. 
) - enddo - call mpp_clock_end(id3) - - call mpp_clock_begin(id4) - do n = 1, num_iter - gsum4= mpp_global_sum(domain, data_2d) - enddo - call mpp_clock_end(id4) - - write(outunit, *) " ********************************************************************************" - write(outunit, *) " global sum for "//type//' bitwise exact sum 2D = ', gsum1 - write(outunit, *) " global sum for "//type//' bitwise EFP sum 2D = ', gsum2 - write(outunit, *) " global sum for "//type//' bitwise EFP sum 2D with overflow_check = ', gsum3 - write(outunit, *) " global sum for "//type//' non-bitwise sum 2D = ', gsum4 - write(outunit, *) " " - write(outunit, *) " chksum for "//type//' bitwise exact sum 2D = ', transfer(gsum1, mold) - write(outunit, *) " chksum for "//type//' bitwise EFP sum 2D = ', transfer(gsum2, mold) - write(outunit, *) " chksum for "//type//' bitwise EFP sum 2D with overflow_check = ', transfer(gsum3, mold) - write(outunit, *) " chksum for "//type//' non-bitwise sum 2D = ', transfer(gsum4, mold) - write(outunit, *) " ********************************************************************************" - - - - nx = nx_save - ny = ny_save - - end subroutine test_mpp_global_sum - - !############################################################### - subroutine test_group_update( type ) - character(len=*), intent(in) :: type - - type(domain2D) :: domain - integer :: num_contact, ntiles, npes_per_tile - integer :: i, j, k, l, n, shift - integer :: isc, iec, jsc, jec, isd, ied, jsd, jed - integer :: ism, iem, jsm, jem - - integer, allocatable, dimension(:) :: pe_start, pe_end, tile1, tile2 - integer, allocatable, dimension(:) :: istart1, iend1, jstart1, jend1 - integer, allocatable, dimension(:) :: istart2, iend2, jstart2, jend2 - integer, allocatable, dimension(:,:) :: layout2D, global_indices - real, allocatable, dimension(:,:,:,:) :: x1, y1, x2, y2 - real, allocatable, dimension(:,:,:,:) :: a1, a2 - real, allocatable, dimension(:,:,:) :: base - integer :: id1, id2, id3 - 
logical :: folded_north - logical :: cubic_grid - character(len=3) :: text - integer :: nx_save, ny_save - type(mpp_group_update_type) :: group_update - type(mpp_group_update_type), allocatable :: update_list(:) - - folded_north = .false. - cubic_grid = .false. - - nx_save = nx - ny_save = ny - !--- check the type - select case(type) - case ( 'Folded-north' ) - ntiles = 1 - shift = 0 - num_contact = 2 - folded_north = .true. - npes_per_tile = npes - if(layout_tripolar(1)*layout_tripolar(2) == npes ) then - layout = layout_tripolar - else - call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) - endif - case ( 'Cubic-Grid' ) - if( nx_cubic == 0 ) then - call mpp_error(NOTE,'test_group_update: for Cubic_grid mosaic, nx_cubic is zero, '//& - 'No test is done for Cubic-Grid mosaic. ' ) - return - endif - if( nx_cubic .NE. ny_cubic ) then - call mpp_error(NOTE,'test_group_update: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& - 'No test is done for Cubic-Grid mosaic. ' ) - return - endif - shift = 1 - nx = nx_cubic - ny = ny_cubic - ntiles = 6 - num_contact = 12 - cubic_grid = .true. - if( mod(npes, ntiles) == 0 ) then - npes_per_tile = npes/ntiles - write(outunit,*)'NOTE from update_domains_performance ==> For Mosaic "', trim(type), & - '", each tile will be distributed over ', npes_per_tile, ' processors.' 
- else - call mpp_error(NOTE,'test_group_update: npes should be multiple of ntiles No test is done for '//trim(type)) - return - endif - if(layout_cubic(1)*layout_cubic(2) == npes_per_tile) then - layout = layout_cubic - else - call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) - endif - case default - call mpp_error(FATAL, 'test_group_update: no such test: '//type) - end select - - allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) - do n = 1, ntiles - pe_start(n) = (n-1)*npes_per_tile - pe_end(n) = n*npes_per_tile-1 - end do - - do n = 1, ntiles - global_indices(:,n) = (/1,nx,1,ny/) - layout2D(:,n) = layout - end do - - allocate(tile1(num_contact), tile2(num_contact) ) - allocate(istart1(num_contact), iend1(num_contact), jstart1(num_contact), jend1(num_contact) ) - allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) ) - - !--- define domain - if(folded_north) then - !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) --- cyclic - tile1(1) = 1; tile2(1) = 1 - istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny - istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny - !--- Contact line 2, between tile 1 (NORTH) and tile 1 (NORTH) --- folded-north-edge - tile1(2) = 1; tile2(2) = 1 - istart1(2) = 1; iend1(2) = nx/2; jstart1(2) = ny; jend1(2) = ny - istart2(2) = nx; iend2(2) = nx/2+1; jstart2(2) = ny; jend2(2) = ny - call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & - istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & - pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & - name = type, symmetry = .false. 
) - else if( cubic_grid ) then - call define_cubic_mosaic(type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & - global_indices, layout2D, pe_start, pe_end ) - endif - - !--- setup data - call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) - call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) - call mpp_get_memory_domain ( domain, ism, iem, jsm, jem ) - - if(num_fields<1) then - call mpp_error(FATAL, "test_mpp_domains: num_fields must be a positive integer") - endif - - allocate(update_list(num_fields)) - - id1 = mpp_clock_id( type//' group 2D', flags=MPP_CLOCK_SYNC ) - id2 = mpp_clock_id( type//' non-group 2D', flags=MPP_CLOCK_SYNC ) - id3 = mpp_clock_id( type//' non-block group 2D', flags=MPP_CLOCK_SYNC ) - - allocate( a1(ism:iem, jsm:jem, nz, num_fields) ) - allocate( x1(ism:iem+shift,jsm:jem, nz, num_fields) ) - allocate( y1(ism:iem, jsm:jem+shift, nz, num_fields) ) - allocate( a2(ism:iem, jsm:jem, nz, num_fields) ) - allocate( x2(ism:iem+shift,jsm:jem, nz, num_fields) ) - allocate( y2(ism:iem, jsm:jem+shift, nz, num_fields) ) - allocate( base(isc:iec+shift,jsc:jec+shift,nz) ) - a1 = 0; x1 = 0; y1 = 0 - - base = 0 - do k = 1,nz - do j = jsc, jec+shift - do i = isc, iec+shift - base(i,j,k) = k + i*1e-3 + j*1e-6 - end do - end do - end do - - !--- Test for partial direction update - do l =1, num_fields - call mpp_create_group_update(group_update, a1(:,:,:,l), domain, flags=WUPDATE+SUPDATE) - end do - - do l = 1, num_fields - a1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 - do k=1,nz - do i=isc-1,iec+1 - a1(i,jsc-1,k,l) = 999; - a1(i,jec+1,k,l) = 999; - enddo - do j=jsc,jec - a1(isc-1,j,k,l) = 999 - a1(iec+1,j,k,l) = 999 - enddo - enddo - enddo - - a2 = a1 - call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1)) - - do l = 1, num_fields - call mpp_update_domains( a2(:,:,:,l), domain, flags=WUPDATE+SUPDATE, complete=l==num_fields ) - enddo - - do l = 1, num_fields - write(text, '(i3.3)') l - call 
compare_checksums(a1(isd:ied,jsd:jed,:,l),a2(isd:ied,jsd:jed,:,l),type//' CENTER South West '//text) - enddo - - call mpp_clear_group_update(group_update) - - !--- Test for DGRID update - if(type == 'Cubic-Grid' ) then - x1 = 0; y1 = 0 - do l =1, num_fields - call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=DGRID_NE) - end do + !--- Test for DGRID update + if(type == 'Cubic-Grid' ) then + x1 = 0; y1 = 0 + do l =1, num_fields + call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=DGRID_NE) + end do do l = 1, num_fields y1(isc:iec+shift,jsc:jec, :,l) = base(isc:iec+shift,jsc:jec, :) + l*1e3 + 1e6 @@ -3412,348 +2755,12 @@ subroutine test_halosize_update( type ) end subroutine test_halosize_update - !############################################################### - subroutine test_unstruct_update( type ) - character(len=*), intent(in) :: type - - type(domain2D) :: SG_domain - type(domainUG) :: UG_domain - integer :: num_contact, ntiles, npes_per_tile - integer :: i, j, k, l, n, shift - integer :: isc, iec, jsc, jec, isd, ied, jsd, jed - integer :: ism, iem, jsm, jem, lsg, leg + !################################################################################# - integer, allocatable, dimension(:) :: pe_start, pe_end, npts_tile, grid_index, ntiles_grid - integer, allocatable, dimension(:,:) :: layout2D, global_indices - real, allocatable, dimension(:,:) :: x1, x2, g1, g2 - real, allocatable, dimension(:,:,:) :: a1, a2, gdata - real, allocatable, dimension(:,:) :: rmask - real, allocatable, dimension(:) :: frac_crit - logical, allocatable, dimension(:,:,:) :: lmask - integer, allocatable, dimension(:) :: isl, iel, jsl, jel - logical :: cubic_grid - character(len=3) :: text - integer :: nx_save, ny_save, tile - integer :: ntotal_land, istart, iend, pos - - cubic_grid = .false. 
- - nx_save = nx - ny_save = ny - !--- check the type - select case(type) - case ( 'Cubic-Grid' ) - if( nx_cubic == 0 ) then - call mpp_error(NOTE,'test_unstruct_update: for Cubic_grid mosaic, nx_cubic is zero, '//& - 'No test is done for Cubic-Grid mosaic. ' ) - return - endif - if( nx_cubic .NE. ny_cubic ) then - call mpp_error(NOTE,'test_unstruct_update: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& - 'No test is done for Cubic-Grid mosaic. ' ) - return - endif - nx = nx_cubic - ny = ny_cubic - ntiles = 6 - num_contact = 12 - cubic_grid = .true. - if( mod(npes, ntiles) == 0 ) then - npes_per_tile = npes/ntiles - write(outunit,*)'NOTE from test_unstruct_update ==> For Mosaic "', trim(type), & - '", each tile will be distributed over ', npes_per_tile, ' processors.' - else - call mpp_error(NOTE,'test_unstruct_update: npes should be multiple of ntiles No test is done for '//trim(type)) - return - endif - if(layout_cubic(1)*layout_cubic(2) == npes_per_tile) then - layout = layout_cubic - else - call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) - endif - allocate(frac_crit(ntiles)) - frac_crit(1) = 0.3; frac_crit(2) = 0.1; frac_crit(3) = 0.6 - frac_crit(4) = 0.2; frac_crit(5) = 0.4; frac_crit(6) = 0.5 - - case default - call mpp_error(FATAL, 'test_group_update: no such test: '//type) - end select - - allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) - do n = 1, ntiles - pe_start(n) = (n-1)*npes_per_tile - pe_end(n) = n*npes_per_tile-1 - end do - - do n = 1, ntiles - global_indices(:,n) = (/1,nx,1,ny/) - layout2D(:,n) = layout - end do - - !--- define domain - if( cubic_grid ) then - call define_cubic_mosaic(type, SG_domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & - global_indices, layout2D, pe_start, pe_end ) - endif - - !--- setup data - call mpp_get_compute_domain( SG_domain, isc, iec, jsc, jec ) - call mpp_get_data_domain ( SG_domain, isd, ied, jsd, jed ) - - allocate(lmask(nx,ny,ntiles)) 
- allocate(npts_tile(ntiles)) - lmask = .false. - if(mpp_pe() == mpp_root_pe() ) then - allocate(rmask(nx,ny)) - !--- construct gmask. - do n = 1, ntiles - call random_number(rmask) - do j = 1, ny - do i = 1, nx - if(rmask(i,j) > frac_crit(n)) then - lmask(i,j,n) = .true. - endif - enddo - enddo - npts_tile(n) = count(lmask(:,:,n)) - enddo - ntotal_land = sum(npts_tile) - allocate(grid_index(ntotal_land)) - l = 0 - allocate(isl(0:mpp_npes()-1), iel(0:mpp_npes()-1)) - allocate(jsl(0:mpp_npes()-1), jel(0:mpp_npes()-1)) - call mpp_get_compute_domains(SG_domain,xbegin=isl,xend=iel,ybegin=jsl,yend=jel) - - do n = 1, ntiles - do j = 1, ny - do i = 1, nx - if(lmask(i,j,n)) then - l = l + 1 - grid_index(l) = (j-1)*nx+i - endif - enddo - enddo - enddo - deallocate(rmask, isl, iel, jsl, jel) - endif - call mpp_broadcast(npts_tile, ntiles, mpp_root_pe()) - if(mpp_pe() .NE. mpp_root_pe()) then - ntotal_land = sum(npts_tile) - allocate(grid_index(ntotal_land)) - endif - call mpp_broadcast(grid_index, ntotal_land, mpp_root_pe()) - - allocate(ntiles_grid(ntotal_land)) - ntiles_grid = 1 - !--- define the unstructured grid domain - call mpp_define_unstruct_domain(UG_domain, SG_domain, npts_tile, ntiles_grid, mpp_npes(), 1, grid_index, name="LAND unstruct") - call mpp_get_UG_compute_domain(UG_domain, istart, iend) - - !--- figure out lmask according to grid_index - pos = 0 - do n = 1, ntiles - do l = 1, npts_tile(n) - pos = pos + 1 - j = (grid_index(pos)-1)/nx + 1 - i = mod((grid_index(pos)-1),nx) + 1 - lmask(i,j,n) = .true. 
- enddo - enddo - - !--- set up data - allocate(gdata(nx,ny,ntiles)) - gdata = -999 - do n = 1, ntiles - do j = 1, ny - do i = 1, nx - if(lmask(i,j,n)) then - gdata(i,j,n) = n*1.e+3 + i + j*1.e-3 - endif - end do - end do - end do - - !--- test the 2-D data is on computing domain - allocate( a1(isc:iec, jsc:jec,1), a2(isc:iec,jsc:jec,1 ) ) - - tile = mpp_pe()/npes_per_tile + 1 - do j = jsc, jec - do i = isc, iec - a1(i,j,1) = gdata(i,j,tile) - enddo - enddo - a2 = -999 - - allocate(x1(istart:iend,1), x2(istart:iend,1)) - x1 = -999 - x2 = -999 - !--- fill the value of x2 - tile = mpp_get_UG_domain_tile_id(UG_domain) - pos = 0 - do n = 1, tile-1 - pos = pos + npts_tile(n) - enddo - do l = istart, iend - i = mod((grid_index(pos+l)-1), nx) + 1 - j = (grid_index(pos+l)-1)/nx + 1 - x2(l,1) = gdata(i,j,tile) - enddo - - call mpp_pass_SG_to_UG(UG_domain, a1(:,:,1), x1(:,1)) - call compare_checksums_2D(x1, x2, type//' SG2UG 2-D compute domain') - call mpp_pass_UG_to_SG(UG_domain, x1(:,1), a2(:,:,1)) - - call compare_checksums(a1(:,:,1:1),a2(:,:,1:1),type//' UG2SG 2-D compute domain') - deallocate(a1,a2,x1,x2) - - !--- test the 3-D data is on computing domain - allocate( a1(isc:iec, jsc:jec,nz), a2(isc:iec,jsc:jec,nz ) ) - - tile = mpp_pe()/npes_per_tile + 1 - do k = 1, nz - do j = jsc, jec - do i = isc, iec - a1(i,j,k) = gdata(i,j,tile) - if(a1(i,j,k) .NE. 
-999) a1(i,j,k) = a1(i,j,k) + k*1.e-6 - enddo - enddo - enddo - a2 = -999 - - allocate(x1(istart:iend,nz), x2(istart:iend,nz)) - x1 = -999 - x2 = -999 - !--- fill the value of x2 - tile = mpp_get_UG_domain_tile_id(UG_domain) - pos = 0 - do n = 1, tile-1 - pos = pos + npts_tile(n) - enddo - do l = istart, iend - i = mod((grid_index(pos+l)-1), nx) + 1 - j = (grid_index(pos+l)-1)/nx + 1 - do k = 1, nz - x2(l,k) = gdata(i,j,tile) + k*1.e-6 - enddo - enddo - - call mpp_pass_SG_to_UG(UG_domain, a1, x1) - call compare_checksums_2D(x1, x2, type//' SG2UG 3-D compute domain') - call mpp_pass_UG_to_SG(UG_domain, x1, a2) - - call compare_checksums(a1,a2,type//' UG2SG 3-D compute domain') - deallocate(a1,a2,x1,x2) - - !--- test the 2-D data is on data domain - allocate( a1(isd:ied, jsd:jed,1), a2(isd:ied,jsd:jed,1 ) ) - a1 = -999; a2 = -999 - - tile = mpp_pe()/npes_per_tile + 1 - do j = jsc, jec - do i = isc, iec - a1(i,j,1) = gdata(i,j,tile) - enddo - enddo - a2 = -999 - - allocate(x1(istart:iend,1), x2(istart:iend,1)) - x1 = -999 - x2 = -999 - !--- fill the value of x2 - tile = mpp_get_UG_domain_tile_id(UG_domain) - pos = 0 - do n = 1, tile-1 - pos = pos + npts_tile(n) - enddo - do l = istart, iend - i = mod((grid_index(pos+l)-1), nx) + 1 - j = (grid_index(pos+l)-1)/nx + 1 - x2(l,1) = gdata(i,j,tile) - enddo - - call mpp_pass_SG_to_UG(UG_domain, a1(:,:,1), x1(:,1)) - call compare_checksums_2D(x1, x2, type//' SG2UG 2-D data domain') - call mpp_pass_UG_to_SG(UG_domain, x1(:,1), a2(:,:,1)) - - call compare_checksums(a1(:,:,1:1),a2(:,:,1:1),type//' UG2SG 2-D data domain') - deallocate(a1,a2,x1,x2) - - !--- test the 3-D data is on computing domain - allocate( a1(isd:ied, jsd:jed,nz), a2(isd:ied,jsd:jed,nz ) ) - a1 = -999; a2 = -999 - - tile = mpp_pe()/npes_per_tile + 1 - do k = 1, nz - do j = jsc, jec - do i = isc, iec - a1(i,j,k) = gdata(i,j,tile) - if(a1(i,j,k) .NE. 
-999) a1(i,j,k) = a1(i,j,k) + k*1.e-6 - enddo - enddo - enddo - a2 = -999 - - allocate(x1(istart:iend,nz), x2(istart:iend,nz)) - x1 = -999 - x2 = -999 - !--- fill the value of x2 - tile = mpp_get_UG_domain_tile_id(UG_domain) - pos = 0 - do n = 1, tile-1 - pos = pos + npts_tile(n) - enddo - do l = istart, iend - i = mod((grid_index(pos+l)-1), nx) + 1 - j = (grid_index(pos+l)-1)/nx + 1 - do k = 1, nz - x2(l,k) = gdata(i,j,tile) + k*1.e-6 - enddo - enddo - - call mpp_pass_SG_to_UG(UG_domain, a1, x1) - call compare_checksums_2D(x1, x2, type//' SG2UG 3-D data domain') - call mpp_pass_UG_to_SG(UG_domain, x1, a2) - - call compare_checksums(a1,a2,type//' UG2SG 3-D data domain') - deallocate(a1,a2,x1,x2) - - !---------------------------------------------------------------- - ! test mpp_global_field_ug - !---------------------------------------------------------------- - call mpp_get_UG_global_domain(UG_domain, lsg, leg) - tile = mpp_get_UG_domain_tile_id(UG_domain) - allocate(g1(lsg:leg,nz), g2(lsg:leg,nz), x1(istart:iend,nz)) - g1 = 0 - g2 = 0 - x1 = 0 - do k = 1, nz - do l = lsg, leg - g1(l,k) = tile*1e6 + l + k*1.e-3 - enddo - do l = istart, iend - x1(l,k) = g1(l,k) - enddo - enddo - - call mpp_global_field_ug(UG_domain, x1, g2) - call compare_checksums_2D(g1,g2,type//' global_field_ug 3-D') - - g2 = 0.0 - call mpp_global_field_ug(UG_domain, x1(:,1), g2(:,1)) - call compare_checksums_2D(g1(:,1:1),g2(:,1:1),type//' global_field_ug 2-D') - - deallocate(g1,g2,x1) - - end subroutine test_unstruct_update - - - - !################################################################################# - - subroutine fill_halo_zero(data, whalo, ehalo, shalo, nhalo, xshift, yshift, isc, iec, jsc, jec, isd, ied, jsd, jed) - integer, intent(in) :: isc, iec, jsc, jec, isd, ied, jsd, jed - integer, intent(in) :: whalo, ehalo, shalo, nhalo, xshift, yshift - real, dimension(isd:,jsd:,:), intent(inout) :: data + subroutine fill_halo_zero(data, whalo, ehalo, shalo, nhalo, xshift, yshift, isc, 
iec, jsc, jec, isd, ied, jsd, jed) + integer, intent(in) :: isc, iec, jsc, jec, isd, ied, jsd, jed + integer, intent(in) :: whalo, ehalo, shalo, nhalo, xshift, yshift + real, dimension(isd:,jsd:,:), intent(inout) :: data if(whalo >=0) then data(iec+ehalo+1+xshift:ied+xshift,jsd:jed+yshift,:) = 0 @@ -5580,97 +4587,33 @@ subroutine fill_cubicgrid_refined_halo(data, data1_all, data2_all, ni, nj, tile, end subroutine fill_cubicgrid_refined_halo - !################################################################################## - subroutine test_subset_update( ) - real, allocatable, dimension(:,:,:) :: x - type(domain2D) :: domain - real, allocatable :: global(:,:,:) - integer :: i, xhalo, yhalo - integer :: is, ie, js, je, isd, ied, jsd, jed -! integer :: pes9(9)=(/1,2,3,4,5,6,7,8,9/) - integer :: pes9(9)=(/0,2,4,10,12,14,20,22,24/) - integer :: ni, nj - - if(mpp_npes() < 25) then - call mpp_error(FATAL,"test_mpp_domains: test_subset_update will& - & not be done when npes < 25") - return - endif - - call mpp_declare_pelist(pes9) - if(any(mpp_pe()==pes9)) then - call mpp_set_current_pelist(pes9) - layout = (/3,3/) - ni = 3; nj =3 - call mpp_define_domains((/1,ni,1,nj/), layout, domain, xhalo=1& - &, yhalo=1, xflags=CYCLIC_GLOBAL_DOMAIN, yflags& - &=CYCLIC_GLOBAL_DOMAIN, name='subset domain') - call mpp_get_compute_domain(domain, is, ie, js, je) - print*, "pe=", mpp_pe(), is, ie, js, je - - allocate(global(0:ni+1,0:nj+1,nz) ) - - global = 0 - do k = 1,nz - do j = 1,nj - do i = 1,ni - global(i,j,k) = k + i*1e-3 + j*1e-6 - end do - end do - end do - - global(0, 1:nj,:) = global(ni, 1:nj,:) - global(ni+1, 1:nj,:) = global(1, 1:nj,:) - global(0:ni+1, 0, :) = global(0:ni+1, nj, :) - global(0:ni+1, nj+1,:) = global(0:ni+1, 1, :) - - !set up x array - call mpp_get_compute_domain( domain, is, ie, js, je ) - call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) - allocate( x (isd:ied,jsd:jed,nz) ) - - x = 0. 
- x (is:ie,js:je,:) = global(is:ie,js:je,:) - -!full update - call mpp_update_domains( x, domain ) - call compare_checksums( x, global(isd:ied,jsd:jed,:), '9pe subset' ) + subroutine set_corner_zero( data, isd, ied, jsd, jed, isc, iec, jsc, jec ) + integer, intent(in) :: isd, ied, jsd, jed + integer, intent(in) :: isc, iec, jsc, jec + real, dimension(isd:,jsd:,:), intent(inout) :: data - deallocate(x, global) - call mpp_deallocate_domain(domain) - endif + data (isd :isc-1, jsd :jsc-1,:) = 0 + data (isd :isc-1, jec+1:jed, :) = 0 + data (iec+1:ied , jsd :jsc-1,:) = 0 + data (iec+1:ied , jec+1:jed, :) = 0 - call mpp_set_current_pelist() - end subroutine test_subset_update + end subroutine set_corner_zero !################################################################################## - subroutine test_halo_update( type ) + subroutine test_update_edge( type ) character(len=*), intent(in) :: type - real, allocatable, dimension(:,:,:) :: x, x1, x2, x3, x4 - real, allocatable, dimension(:,:,:) :: y, y1, y2, y3, y4 + real, allocatable, dimension(:,:,:) :: x, x2, a + real, allocatable, dimension(:,:,:) :: y, y2, b type(domain2D) :: domain real, allocatable :: global1(:,:,:), global2(:,:,:), global(:,:,:) logical, allocatable :: maskmap(:,:) integer :: shift, i, xhalo, yhalo logical :: is_symmetry, folded_south, folded_west, folded_east integer :: is, ie, js, je, isd, ied, jsd, jed + integer :: id_update - ! when testing maskmap option, nx*ny should be able to be divided by both npes and npes+1 - if(type == 'Masked' .or. type == 'Masked symmetry') then - if(mod(nx*ny, npes) .NE. 0 .OR. mod(nx*ny, npes+1) .NE. 
0 ) then - call mpp_error(NOTE,'TEST_MPP_DOMAINS: nx*ny can not be divided by both npes and npes+1, '//& - 'Masked test_halo_update will not be tested') - return - end if - end if - - if(type == 'Folded xy_halo' ) then - xhalo = max(whalo, ehalo); yhalo = max(shalo, nhalo) - allocate(global(1-xhalo:nx+xhalo,1-yhalo:ny+yhalo,nz) ) - else - allocate(global(1-whalo:nx+ehalo,1-shalo:ny+nhalo,nz) ) - end if + allocate(global(1-whalo:nx+ehalo,1-shalo:ny+nhalo,nz) ) global = 0 do k = 1,nz @@ -5687,104 +4630,56 @@ subroutine test_halo_update( type ) is_symmetry = .true. end if select case(type) - case( 'Simple', 'Simple symmetry' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, name=type, symmetry = is_symmetry ) - case( 'Cyclic', 'Cyclic symmetry' ) + case( 'Cyclic' ) call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN, & name=type, symmetry = is_symmetry ) - global(1-whalo:0, 1:ny,:) = global(nx-whalo+1:nx, 1:ny,:) - global(nx+1:nx+ehalo, 1:ny,:) = global(1:ehalo, 1:ny,:) - global(1-whalo:nx+ehalo, 1-shalo:0,:) = global(1-whalo:nx+ehalo, ny-shalo+1:ny,:) - global(1-whalo:nx+ehalo, ny+1:ny+nhalo,:) = global(1-whalo:nx+ehalo, 1:nhalo,:) + global(1-whalo:0, 1:ny,:) = global(nx-whalo+1:nx, 1:ny,:) + global(nx+1:nx+ehalo, 1:ny,:) = global(1:ehalo, 1:ny,:) + global(1:nx, 1-shalo:0,:) = global(1:nx, ny-shalo+1:ny,:) + global(1:nx, ny+1:ny+nhalo,:) = global(1:nx, 1:nhalo, :) case( 'Folded-north', 'Folded-north symmetry' ) call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, & name=type, symmetry = is_symmetry ) call 
fill_folded_north_halo(global, 0, 0, 0, 0, 1) - case( 'Folded-south symmetry' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_SOUTH_EDGE, & - name=type, symmetry = is_symmetry ) - call fill_folded_south_halo(global, 0, 0, 0, 0, 1) - case( 'Folded-west symmetry' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, xflags=FOLD_WEST_EDGE, yflags=CYCLIC_GLOBAL_DOMAIN, & - name=type, symmetry = is_symmetry ) - call fill_folded_west_halo(global, 0, 0, 0, 0, 1) - case( 'Folded-east symmetry' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, xflags=FOLD_EAST_EDGE, yflags=CYCLIC_GLOBAL_DOMAIN, & - name=type, symmetry = is_symmetry ) - call fill_folded_east_halo(global, 0, 0, 0, 0, 1) - case( 'Folded xy_halo' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, xhalo=xhalo, yhalo=yhalo, & - xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, name=type, symmetry = is_symmetry ) - global(1-xhalo:0, 1:ny,:) = global(nx-xhalo+1:nx, 1:ny,:) - global(nx+1:nx+xhalo, 1:ny,:) = global(1:xhalo, 1:ny,:) - global(1-xhalo:nx+xhalo,ny+1:ny+yhalo,:) = global(nx+xhalo:1-xhalo:-1, ny:ny-yhalo+1:-1,:) - case( 'Masked', 'Masked symmetry' ) -!with fold and cyclic, assign to npes+1 and mask out the top-rightdomain - call mpp_define_layout( (/1,nx,1,ny/), npes+1, layout ) - allocate( maskmap(layout(1),layout(2)) ) - maskmap(:,:) = .TRUE.; maskmap(layout(1),layout(2)) = .FALSE. 
- call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, & - maskmap=maskmap, name=type, symmetry = is_symmetry ) - deallocate(maskmap) - !we need to zero out the global data on the missing domain. - !this logic assumes top-right, in an even division - if( mod(nx,layout(1)).NE.0 .OR. mod(ny,layout(2)).NE.0 )call mpp_error( FATAL, & - 'TEST_MPP_DOMAINS: test for masked domains needs (nx,ny) to divide evenly on npes+1 PEs.' ) - global(nx-nx/layout(1)+1:nx,ny-ny/layout(2)+1:ny,:) = 0 - call fill_folded_north_halo(global, 0, 0, 0, 0, 1) + !--- set the corner to 0 + call set_corner_zero(global, 1-whalo, nx+ehalo, 1-shalo, ny+nhalo, 1, nx, 1, ny) case default - call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type ) + call mpp_error( FATAL, 'test_update_edge: no such test: '//type ) end select !set up x array call mpp_get_compute_domain( domain, is, ie, js, je ) call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) allocate( x (isd:ied,jsd:jed,nz) ) - allocate( x1(isd:ied,jsd:jed,nz) ) - allocate( x2(isd:ied,jsd:jed,nz) ) - allocate( x3(isd:ied,jsd:jed,nz) ) - allocate( x4(isd:ied,jsd:jed,nz) ) + allocate( a (isd:ied,jsd:jed,nz) ) + allocate( x2 (isd:ied,jsd:jed,nz) ) + x2 (isd:ied,jsd:jed,:) = global(isd:ied,jsd:jed,:) + call set_corner_zero(x2, isd, ied, jsd, jed, is, ie, js, je) + x = 0.
x (is:ie,js:je,:) = global(is:ie,js:je,:) - x1 = x; x2 = x; x3 = x; x4 = x !full update id = mpp_clock_id( type, flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) call mpp_clock_begin(id) - call mpp_update_domains( x, domain ) + call mpp_update_domains( x, domain, flags=EDGEUPDATE) call mpp_clock_end (id) - call compare_checksums( x, global(isd:ied,jsd:jed,:), type ) + call compare_checksums( x, x2, type ) + deallocate(x2) -!partial update - id = mpp_clock_id( type//' partial', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) - call mpp_clock_begin(id) - call mpp_update_domains( x1, domain, NUPDATE+EUPDATE, complete=.false. ) - call mpp_update_domains( x2, domain, NUPDATE+EUPDATE, complete=.false. ) - call mpp_update_domains( x3, domain, NUPDATE+EUPDATE, complete=.false. ) - call mpp_update_domains( x4, domain, NUPDATE+EUPDATE, complete=.true. ) - call mpp_clock_end (id) - call compare_checksums( x1(is:ied,js:jed,:), global(is:ied,js:jed,:), type//' partial x1' ) - call compare_checksums( x2(is:ied,js:jed,:), global(is:ied,js:jed,:), type//' partial x2' ) - call compare_checksums( x3(is:ied,js:jed,:), global(is:ied,js:jed,:), type//' partial x3' ) - call compare_checksums( x4(is:ied,js:jed,:), global(is:ied,js:jed,:), type//' partial x4' ) + a = 0 + a(is:ie,js:je,:) = global(is:ie,js:je,:) + id_update = mpp_start_update_domains( a, domain, flags=EDGEUPDATE) + call mpp_complete_update_domains(id_update, a, domain, flags=EDGEUPDATE) + call compare_checksums( x, a, type//" nonblock") - !--- test vector update for FOLDED and MASKED case. - if(type == 'Simple' .or. type == 'Simple symmetry' .or. type == 'Cyclic' .or. type == 'Cyclic symmetry') then - deallocate(x,x1,x2,x3,x4) + !--- test vector update for FOLDED and MASKED case. 
+ if( type == 'Cyclic' ) then + deallocate(global, x, a) return end if @@ -5804,375 +4699,30 @@ subroutine test_halo_update( type ) end do end do end do - if(type == 'Masked symmetry') then - global(nx-nx/layout(1)+1:nx+1,ny-ny/layout(2)+1:ny+1,:) = 0 - endif - deallocate(x, x1, x2, x3, x4) + deallocate(x,a) allocate( x (isd:ied+1,jsd:jed+1,nz) ) - allocate( x1(isd:ied+1,jsd:jed+1,nz) ) - allocate( x2(isd:ied+1,jsd:jed+1,nz) ) - allocate( x3(isd:ied+1,jsd:jed+1,nz) ) - allocate( x4(isd:ied+1,jsd:jed+1,nz) ) + allocate( a (isd:ied+1,jsd:jed+1,nz) ) endif - folded_south = .false. - folded_west = .false. - folded_east = .false. select case (type) - case ('Folded-north', 'Masked') + case ('Folded-north') !fill in folded north edge, cyclic east and west edge call fill_folded_north_halo(global, 1, 1, 0, 0, -1) - case ('Folded xy_halo') - !fill in folded north edge, cyclic east and west edge - global(1-xhalo:0, 1:ny,:) = global(nx-xhalo+1:nx, 1:ny,:) - global(nx+1:nx+xhalo, 1:ny,:) = global(1:xhalo, 1:ny,:) - global(1-xhalo:nx+xhalo-1,ny+1:ny+yhalo,:) = -global(nx+xhalo-1:1-xhalo:-1,ny-1:ny-yhalo:-1,:) - global(nx+xhalo, ny+1:ny+yhalo,:) = -global(nx-xhalo, ny-1:ny-yhalo:-1,:) - case ('Folded-north symmetry', 'Masked symmetry' ) + case ('Folded-north symmetry') call fill_folded_north_halo(global, 1, 1, 1, 1, -1) - case ('Folded-south symmetry' ) - folded_south = .true. - call fill_folded_south_halo(global, 1, 1, 1, 1, -1) - case ('Folded-west symmetry' ) - folded_west = .true. - call fill_folded_west_halo(global, 1, 1, 1, 1, -1) - case ('Folded-east symmetry' ) - folded_east = .true. - call fill_folded_east_halo(global, 1, 1, 1, 1, -1) case default call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type ) end select x = 0. + a = 0. 
x(is:ie+shift,js:je+shift,:) = global(is:ie+shift,js:je+shift,:) + a(is:ie+shift,js:je+shift,:) = global(is:ie+shift,js:je+shift,:) !set up y array allocate( y (isd:ied+shift,jsd:jed+shift,nz) ) - allocate( y1(isd:ied+shift,jsd:jed+shift,nz) ) - allocate( y2(isd:ied+shift,jsd:jed+shift,nz) ) - allocate( y3(isd:ied+shift,jsd:jed+shift,nz) ) - allocate( y4(isd:ied+shift,jsd:jed+shift,nz) ) - y = x; x1 = x; x2 = x; x3 = x; x4 = x - y = x; y1 = x; y2 = x; y3 = x; y4 = x - - id = mpp_clock_id( type//' vector BGRID_NE', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) - call mpp_clock_begin(id) - call mpp_update_domains( x, y, domain, gridtype=BGRID_NE) - call mpp_update_domains( x1, y1, domain, gridtype=BGRID_NE, complete=.false. ) - call mpp_update_domains( x2, y2, domain, gridtype=BGRID_NE, complete=.false. ) - call mpp_update_domains( x3, y3, domain, gridtype=BGRID_NE, complete=.false. ) - call mpp_update_domains( x4, y4, domain, gridtype=BGRID_NE, complete=.true. ) - call mpp_clock_end (id) - - !redundant points must be equal and opposite - - if(folded_south) then - global(nx/2+shift, 1,:) = 0. !pole points must have 0 velocity - global(nx+shift , 1,:) = 0. !pole points must have 0 velocity - global(nx/2+1+shift:nx-1+shift, 1,:) = -global(nx/2-1+shift:1+shift:-1, 1,:) - global(1-whalo:shift, 1,:) = -global(nx-whalo+1:nx+shift, 1,:) - global(nx+1+shift:nx+ehalo+shift, 1,:) = -global(1+shift:ehalo+shift, 1,:) - !--- the following will fix the +0/-0 problem on altix - if(shalo >0) global(shift,1,:) = 0. !pole points must have 0 velocity - else if(folded_west) then - global(1, ny/2+shift, :) = 0. !pole points must have 0 velocity - global(1, ny+shift, :) = 0. 
!pole points must have 0 velocity - global(1, ny/2+1+shift:ny-1+shift, :) = -global(1, ny/2-1+shift:1+shift:-1, :) - global(1, 1-shalo:shift, :) = -global(1, ny-shalo+1:ny+shift, :) - global(1, ny+1+shift:ny+nhalo+shift, :) = -global(1, 1+shift:nhalo+shift, :) - !--- the following will fix the +0/-0 problem on altix - if(whalo>0) global(1, shift, :) = 0. !pole points must have 0 velocity - else if(folded_east) then - global(nx+shift, ny/2+shift, :) = 0. !pole points must have 0 velocity - global(nx+shift, ny+shift, :) = 0. !pole points must have 0 velocity - global(nx+shift, ny/2+1+shift:ny-1+shift, :) = -global(nx+shift, ny/2-1+shift:1+shift:-1, :) - global(nx+shift, 1-shalo:shift, :) = -global(nx+shift, ny-shalo+1:ny+shift, :) - global(nx+shift, ny+1+shift:ny+nhalo+shift, :) = -global(nx+shift, 1+shift:nhalo+shift, :) - if(ehalo >0) global(nx+shift, shift, :) = 0. !pole points must have 0 velocity - else - global(nx/2+shift, ny+shift,:) = 0. !pole points must have 0 velocity - global(nx+shift , ny+shift,:) = 0. !pole points must have 0 velocity - global(nx/2+1+shift:nx-1+shift, ny+shift,:) = -global(nx/2-1+shift:1+shift:-1, ny+shift,:) - if(type == 'Folded xy_halo') then - global(1-xhalo:shift, ny+shift,:) = -global(nx-xhalo+1:nx+shift, ny+shift,:) - global(nx+1+shift:nx+xhalo+shift, ny+shift,:) = -global(1+shift:xhalo+shift, ny+shift,:) - else - global(1-whalo:shift, ny+shift,:) = -global(nx-whalo+1:nx+shift, ny+shift,:) - global(nx+1+shift:nx+ehalo+shift, ny+shift,:) = -global(1+shift:ehalo+shift, ny+shift,:) - end if - !--- the following will fix the +0/-0 problem on altix - if(nhalo >0) global(shift,ny+shift,:) = 0. 
!pole points must have 0 velocity - endif - - call compare_checksums( x, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE X' ) - call compare_checksums( y, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE Y' ) - call compare_checksums( x1, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE X1' ) - call compare_checksums( x2, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE X2' ) - call compare_checksums( x3, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE X3' ) - call compare_checksums( x4, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE X4' ) - call compare_checksums( y1, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE Y1' ) - call compare_checksums( y2, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE Y2' ) - call compare_checksums( y3, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE Y3' ) - call compare_checksums( y4, global(isd:ied+shift,jsd:jed+shift,:), type//' BGRID_NE Y4' ) - - deallocate(global, x, x1, x2, x3, x4, y, y1, y2, y3, y4) - - !------------------------------------------------------------------ - ! 
vector update : CGRID_NE - !------------------------------------------------------------------ - !--- global1 is x-component and global2 is y-component - if(type == 'Folded xy_halo') then - allocate(global1(1-xhalo:nx+xhalo, 1-yhalo:ny+yhalo, nz)) - allocate(global2(1-xhalo:nx+xhalo, 1-yhalo:ny+yhalo, nz)) - else - allocate(global1(1-whalo:nx+ehalo+shift, 1-shalo:ny+nhalo, nz)) - allocate(global2(1-whalo:nx+ehalo, 1-shalo:ny+nhalo+shift, nz)) - end if - allocate(x (isd:ied+shift,jsd:jed,nz), y (isd:ied,jsd:jed+shift,nz) ) - allocate(x1(isd:ied+shift,jsd:jed,nz), y1(isd:ied,jsd:jed+shift,nz) ) - allocate(x2(isd:ied+shift,jsd:jed,nz), y2(isd:ied,jsd:jed+shift,nz) ) - allocate(x3(isd:ied+shift,jsd:jed,nz), y3(isd:ied,jsd:jed+shift,nz) ) - allocate(x4(isd:ied+shift,jsd:jed,nz), y4(isd:ied,jsd:jed+shift,nz) ) - - global1 = 0.0 - global2 = 0.0 - do k = 1,nz - do j = 1,ny - do i = 1,nx+shift - global1(i,j,k) = k + i*1e-3 + j*1e-6 - end do - end do - do j = 1,ny+shift - do i = 1,nx - global2(i,j,k) = k + i*1e-3 + j*1e-6 - end do - end do - end do - - if(type == 'Masked' .or. 
type == 'Masked symmetry') then - global1(nx-nx/layout(1)+1:nx+shift,ny-ny/layout(2)+1:ny,:) = 0 - global2(nx-nx/layout(1)+1:nx,ny-ny/layout(2)+1:ny+shift,:) = 0 - end if - - select case (type) - case ('Folded-north', 'Masked') - !fill in folded north edge, cyclic east and west edge - call fill_folded_north_halo(global1, 1, 0, 0, 0, -1) - call fill_folded_north_halo(global2, 0, 1, 0, 0, -1) - case ('Folded xy_halo') - global1(1-xhalo:0, 1:ny,:) = global1(nx-xhalo+1:nx, 1:ny,:) - global1(nx+1:nx+xhalo, 1:ny,:) = global1(1:xhalo, 1:ny,:) - global2(1-xhalo:0, 1:ny,:) = global2(nx-xhalo+1:nx, 1:ny,:) - global2(nx+1:nx+xhalo, 1:ny,:) = global2(1:xhalo, 1:ny,:) - global1(1-xhalo:nx+xhalo-1, ny+1:ny+yhalo,:) = -global1(nx+xhalo-1:1-xhalo:-1, ny:ny-yhalo+1:-1,:) - global1(nx+xhalo, ny+1:ny+yhalo,:) = -global1(nx-xhalo, ny:ny-yhalo+1:-1,:) - global2(1-xhalo:nx+xhalo, ny+1:ny+yhalo,:) = -global2(nx+xhalo:1-xhalo:-1, ny-1:ny-yhalo:-1,:) - case ('Folded-north symmetry') - call fill_folded_north_halo(global1, 1, 0, 1, 0, -1) - call fill_folded_north_halo(global2, 0, 1, 0, 1, -1) - case ('Folded-south symmetry') - call fill_folded_south_halo(global1, 1, 0, 1, 0, -1) - call fill_folded_south_halo(global2, 0, 1, 0, 1, -1) - case ('Folded-west symmetry') - call fill_folded_west_halo(global1, 1, 0, 1, 0, -1) - call fill_folded_west_halo(global2, 0, 1, 0, 1, -1) - case ('Folded-east symmetry') - call fill_folded_east_halo(global1, 1, 0, 1, 0, -1) - call fill_folded_east_halo(global2, 0, 1, 0, 1, -1) - case default - call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type ) - end select - - x = 0.; y = 0. 
- x(is:ie+shift,js:je, :) = global1(is:ie+shift,js:je, :) - y(is:ie ,js:je+shift,:) = global2(is:ie, js:je+shift,:) - x1 = x; x2 = x; x3 = x; x4 = x - y1 = y; y2 = y; y3 = y; y4 = y - - id = mpp_clock_id( type//' vector CGRID_NE', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) - call mpp_clock_begin(id) - call mpp_update_domains( x, y, domain, gridtype=CGRID_NE) - call mpp_update_domains( x1, y1, domain, gridtype=CGRID_NE, complete=.false. ) - call mpp_update_domains( x2, y2, domain, gridtype=CGRID_NE, complete=.false. ) - call mpp_update_domains( x3, y3, domain, gridtype=CGRID_NE, complete=.false. ) - call mpp_update_domains( x4, y4, domain, gridtype=CGRID_NE, complete=.true. ) - call mpp_clock_end (id) - - !redundant points must be equal and opposite - if(folded_south) then - global2(nx/2+1:nx, 1,:) = -global2(nx/2:1:-1, 1,:) - global2(1-whalo:0, 1,:) = -global2(nx-whalo+1:nx, 1, :) - global2(nx+1:nx+ehalo, 1,:) = -global2(1:ehalo, 1, :) - else if(folded_west) then - global1(1, ny/2+1:ny, :) = -global1(1, ny/2:1:-1, :) - global1(1, 1-shalo:0, :) = -global1(1, ny-shalo+1:ny, :) - global1(1, ny+1:ny+nhalo, :) = -global1(1, 1:nhalo, :) - else if(folded_east) then - global1(nx+shift, ny/2+1:ny, :) = -global1(nx+shift, ny/2:1:-1, :) - global1(nx+shift, 1-shalo:0, :) = -global1(nx+shift, ny-shalo+1:ny, :) - global1(nx+shift, ny+1:ny+nhalo, :) = -global1(nx+shift, 1:nhalo, :) - else - global2(nx/2+1:nx, ny+shift,:) = -global2(nx/2:1:-1, ny+shift,:) - if(type == 'Folded xy_halo') then - global2(1-xhalo:0, ny+shift,:) = -global2(nx-xhalo+1:nx, ny+shift,:) - global2(nx+1:nx+xhalo, ny+shift,:) = -global2(1:xhalo, ny+shift,:) - else - global2(1-whalo:0, ny+shift,:) = -global2(nx-whalo+1:nx, ny+shift,:) - global2(nx+1:nx+ehalo, ny+shift,:) = -global2(1:ehalo, ny+shift,:) - end if - endif - - call compare_checksums( x, global1(isd:ied+shift,jsd:jed, :), type//' CGRID_NE X' ) - call compare_checksums( y, global2(isd:ied, jsd:jed+shift,:), type//' CGRID_NE Y' ) - call 
compare_checksums( x1, global1(isd:ied+shift,jsd:jed, :), type//' CGRID_NE X1' ) - call compare_checksums( x2, global1(isd:ied+shift,jsd:jed, :), type//' CGRID_NE X2' ) - call compare_checksums( x3, global1(isd:ied+shift,jsd:jed, :), type//' CGRID_NE X3' ) - call compare_checksums( x4, global1(isd:ied+shift,jsd:jed, :), type//' CGRID_NE X4' ) - call compare_checksums( y1, global2(isd:ied, jsd:jed+shift,:), type//' CGRID_NE Y1' ) - call compare_checksums( y2, global2(isd:ied, jsd:jed+shift,:), type//' CGRID_NE Y2' ) - call compare_checksums( y3, global2(isd:ied, jsd:jed+shift,:), type//' CGRID_NE Y3' ) - call compare_checksums( y4, global2(isd:ied, jsd:jed+shift,:), type//' CGRID_NE Y4' ) - - deallocate(global1, global2, x, x1, x2, x3, x4, y, y1, y2, y3, y4) - - - end subroutine test_halo_update - - subroutine set_corner_zero( data, isd, ied, jsd, jed, isc, iec, jsc, jec ) - integer, intent(in) :: isd, ied, jsd, jed - integer, intent(in) :: isc, iec, jsc, jec - real, dimension(isd:,jsd:,:), intent(inout) :: data - - data (isd :isc-1, jsd :jsc-1,:) = 0 - data (isd :isc-1, jec+1:jed, :) = 0 - data (iec+1:ied , jsd :jsc-1,:) = 0 - data (iec+1:ied , jec+1:jed, :) = 0 - - - end subroutine set_corner_zero - - !################################################################################## - subroutine test_update_edge( type ) - character(len=*), intent(in) :: type - real, allocatable, dimension(:,:,:) :: x, x2, a - real, allocatable, dimension(:,:,:) :: y, y2, b - type(domain2D) :: domain - real, allocatable :: global1(:,:,:), global2(:,:,:), global(:,:,:) - logical, allocatable :: maskmap(:,:) - integer :: shift, i, xhalo, yhalo - logical :: is_symmetry, folded_south, folded_west, folded_east - integer :: is, ie, js, je, isd, ied, jsd, jed - integer :: id_update - - allocate(global(1-whalo:nx+ehalo,1-shalo:ny+nhalo,nz) ) - - global = 0 - do k = 1,nz - do j = 1,ny - do i = 1,nx - global(i,j,k) = k + i*1e-3 + j*1e-6 - end do - end do - end do - - if(index(type, 
'symmetry') == 0) then - is_symmetry = .false. - else - is_symmetry = .true. - end if - select case(type) - case( 'Cyclic' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN, & - name=type, symmetry = is_symmetry ) - global(1-whalo:0, 1:ny,:) = global(nx-whalo+1:nx, 1:ny,:) - global(nx+1:nx+ehalo, 1:ny,:) = global(1:ehalo, 1:ny,:) - global(1:nx, 1-shalo:0,:) = global(1:nx, ny-shalo+1:ny,:) - global(1:nx, ny+1:ny+nhalo,:) = global(1:nx, 1:nhalo, :) - case( 'Folded-north', 'Folded-north symmetry' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, & - name=type, symmetry = is_symmetry ) - call fill_folded_north_halo(global, 0, 0, 0, 0, 1) - !--- set the corner to 0 - call set_corner_zero(global, 1-whalo, nx+ehalo, 1-shalo, ny+ehalo, 1, nx, 1, ny) - case default - call mpp_error( FATAL, 'test_update_edge: no such test: '//type ) - end select - -!set up x array - call mpp_get_compute_domain( domain, is, ie, js, je ) - call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) - allocate( x (isd:ied,jsd:jed,nz) ) - allocate( a (isd:ied,jsd:jed,nz) ) - allocate( x2 (isd:ied,jsd:jed,nz) ) - x2 (isd:ied,jsd:jed,:) = global(isd:ied,jsd:jed,:) - call set_corner_zero(x2, isd, ied, jsd, jed, is, ie, js, je) - - x = 0. 
- x (is:ie,js:je,:) = global(is:ie,js:je,:) - -!full update - id = mpp_clock_id( type, flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) - call mpp_clock_begin(id) - call mpp_update_domains( x, domain, flags=EDGEUPDATE) - call mpp_clock_end (id) - call compare_checksums( x, x2, type ) - deallocate(x2) - - a = 0 - a(is:ie,js:je,:) = global(is:ie,js:je,:) - id_update = mpp_start_update_domains( a, domain, flags=EDGEUPDATE) - call mpp_complete_update_domains(id_update, a, domain, flags=EDGEUPDATE) - call compare_checksums( x, a, type//" nonblock") - - !--- test vector update for FOLDED and MASKED case. - if( type == 'Cyclic' ) then - deallocate(global, x, a) - return - end if - - !------------------------------------------------------------------ - ! vector update : BGRID_NE - !------------------------------------------------------------------ - shift = 0 - if(is_symmetry) then - shift = 1 - deallocate(global) - allocate(global(1-whalo:nx+ehalo+shift,1-shalo:ny+nhalo+shift,nz) ) - global = 0.0 - do k = 1,nz - do j = 1,ny+1 - do i = 1,nx+1 - global(i,j,k) = k + i*1e-3 + j*1e-6 - end do - end do - end do - deallocate(x,a) - allocate( x (isd:ied+1,jsd:jed+1,nz) ) - allocate( a (isd:ied+1,jsd:jed+1,nz) ) - endif - - select case (type) - case ('Folded-north') - !fill in folded north edge, cyclic east and west edge - call fill_folded_north_halo(global, 1, 1, 0, 0, -1) - case ('Folded-north symmetry') - call fill_folded_north_halo(global, 1, 1, 1, 1, -1) - case default - call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type ) - end select - - x = 0. - a = 0. 
- x(is:ie+shift,js:je+shift,:) = global(is:ie+shift,js:je+shift,:) - a(is:ie+shift,js:je+shift,:) = global(is:ie+shift,js:je+shift,:) - !set up y array - allocate( y (isd:ied+shift,jsd:jed+shift,nz) ) - allocate( b (isd:ied+shift,jsd:jed+shift,nz) ) - b = x - y = x + allocate( b (isd:ied+shift,jsd:jed+shift,nz) ) + b = x + y = x id = mpp_clock_id( type//' vector BGRID_NE', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) call mpp_clock_begin(id) call mpp_update_domains( x, y, domain, flags=EDGEUPDATE, gridtype=BGRID_NE) @@ -6644,25 +5194,28 @@ subroutine test_cyclic_offset( type ) end subroutine test_cyclic_offset - - subroutine test_global_field( type ) + !--- test mpp_global_sum, mpp_global_min and mpp_global_max + subroutine test_global_reduce (type) character(len=*), intent(in) :: type - real, allocatable, dimension(:,:,:) :: x, gcheck - type(domain2D) :: domain - real, allocatable :: global1(:,:,:) - integer :: ishift, jshift, ni, nj, i, j, position - integer, allocatable :: pelist(:) + real :: lsum, gsum, lmax, gmax, lmin, gmin + integer :: ni, nj, ishift, jshift, position integer :: is, ie, js, je, isd, ied, jsd, jed + type(domain2D) :: domain + real, allocatable, dimension(:,:,:) :: global1, x + real, allocatable, dimension(:,:) :: global2D !--- set up domain call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) select case(type) - case( 'Non-symmetry' ) + case( 'Simple' ) call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & shalo=shalo, nhalo=nhalo, name=type ) - case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 'Symmetry north' ) + case( 'Simple symmetry center', 'Simple symmetry corner', 'Simple symmetry east', 'Simple symmetry north' ) call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) + case( 'Cyclic symmetry center', 'Cyclic symmetry corner', 'Cyclic symmetry east', 'Cyclic symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name=type, symmetry = .true., xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN ) case default call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) end select @@ -6670,15 +5223,14 @@ subroutine test_global_field( type ) call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) !--- determine if an extra point is needed - ishift = 0; jshift = 0 - position = CENTER + ishift = 0; jshift = 0; position = CENTER select case(type) - case ('Symmetry corner') - ishift = 1; jshift = 1; position=CORNER - case ('Symmetry east') - ishift = 1; jshift = 0; position=EAST - case ('Symmetry north') - ishift = 0; jshift = 1; position=NORTH + case ('Simple symmetry corner', 'Cyclic symmetry corner') + ishift = 1; jshift = 1; position = CORNER + case ('Simple symmetry east', 'Cyclic symmetry east' ) + ishift = 1; jshift = 0; position = EAST + case ('Simple symmetry north', 'Cyclic symmetry north') + ishift = 0; jshift = 1; position = NORTH end select ie = ie+ishift; je = je+jshift @@ -6694,170 +5246,29 @@ subroutine test_global_field( type ) end do enddo - allocate( gcheck(ni, nj, nz) ) + !--- NOTE: even though the domain is cyclic, no need to apply cyclic condition on the global data + allocate( x (isd:ied,jsd:jed,nz) ) + allocate( global2D(ni,nj)) x(:,:,:) = global1(isd:ied,jsd:jed,:) + do j = 1, nj + do i = 1, ni + global2D(i,j) = sum(global1(i,j,:)) + enddo + enddo + !test mpp_global_sum - !--- test the data on data domain - gcheck = 0. 
- id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + if(type(1:6) == 'Simple') then + gsum = sum( global2D(1:ni,1:nj) ) + else + gsum = sum( global2D(1:nx, 1:ny) ) + endif + id = mpp_clock_id( type//' sum', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) call mpp_clock_begin(id) - call mpp_global_field( domain, x, gcheck, position=position ) + lsum = mpp_global_sum( domain, x, position = position ) call mpp_clock_end (id) - !compare checksums between global and x arrays - call compare_checksums( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on data domain' ) - - !--- Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) - !--- will be declared. But for the x-direction global field, mpp_sync_self will - !--- be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) - !--- in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), - !--- deadlock will happen. For example npes = 6 and layout = (2,3), pelist = (4,5) - !--- will be set in mpp_sync_self. To solve the problem, some explicit mpp_declare_pelist - !--- on all pe is needed for those partial pelist. But for y-update, it is ok. - !--- because the pelist in y-update is not continous. - allocate(pelist(0:layout(1)-1)) - do j = 0, layout(2)-1 - do i = 0, layout(1)-1 - pelist(i) = j*layout(1) + i - end do - call mpp_declare_pelist(pelist) - end do - deallocate(pelist) - - !xupdate - gcheck = 0. - call mpp_clock_begin(id) - call mpp_global_field( domain, x, gcheck, flags = XUPDATE, position=position ) - call mpp_clock_end (id) - !compare checksums between global and x arrays - call compare_checksums( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:), & - type//' mpp_global_field xupdate only on data domain' ) - - !yupdate - gcheck = 0. 
- call mpp_clock_begin(id) - call mpp_global_field( domain, x, gcheck, flags = YUPDATE, position=position ) - call mpp_clock_end (id) - !compare checksums between global and x arrays - call compare_checksums( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:), & - type//' mpp_global_field yupdate only on data domain' ) - - call mpp_clock_begin(id) - call mpp_global_field( domain, x, gcheck, position=position ) - - call mpp_clock_end (id) - !compare checksums between global and x arrays - call compare_checksums( global1(1:ni,1:nj,:), gcheck, & - type//' mpp_global_field on data domain' ) - - !--- test the data on compute domain - gcheck = 0. - id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) - call mpp_clock_begin(id) - call mpp_global_field( domain, x(is:ie, js:je, :), gcheck, position=position ) - call mpp_clock_end (id) - !compare checksums between global and x arrays - call compare_checksums( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on compute domain' ) - - !xupdate - gcheck = 0. - call mpp_clock_begin(id) - call mpp_global_field( domain, x(is:ie, js:je,:), gcheck, flags = XUPDATE, position=position ) - call mpp_clock_end (id) - !compare checksums between global and x arrays - call compare_checksums( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:), & - type//' mpp_global_field xupdate only on compute domain' ) - - !yupdate - gcheck = 0. 
- call mpp_clock_begin(id) - call mpp_global_field( domain, x(is:ie, js:je,:), gcheck, flags = YUPDATE, position=position ) - call mpp_clock_end (id) - !compare checksums between global and x arrays - call compare_checksums( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:), & - type//' mpp_global_field yupdate only on compute domain' ) - - - deallocate(global1, gcheck, x) - - end subroutine test_global_field - - !--- test mpp_global_sum, mpp_global_min and mpp_global_max - subroutine test_global_reduce (type) - character(len=*), intent(in) :: type - real :: lsum, gsum, lmax, gmax, lmin, gmin - integer :: ni, nj, ishift, jshift, position - integer :: is, ie, js, je, isd, ied, jsd, jed - - type(domain2D) :: domain - real, allocatable, dimension(:,:,:) :: global1, x - real, allocatable, dimension(:,:) :: global2D - !--- set up domain - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - select case(type) - case( 'Simple' ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, name=type ) - case( 'Simple symmetry center', 'Simple symmetry corner', 'Simple symmetry east', 'Simple symmetry north' ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) - case( 'Cyclic symmetry center', 'Cyclic symmetry corner', 'Cyclic symmetry east', 'Cyclic symmetry north' ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & - name=type, symmetry = .true., xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN ) - case default - call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) - end select - call mpp_get_compute_domain( domain, is, ie, js, je ) - call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) - - !--- determine if an extra point is needed - ishift = 0; jshift = 0; position = CENTER - select case(type) - case ('Simple symmetry corner', 'Cyclic symmetry corner') - ishift = 1; jshift = 1; position = CORNER - case ('Simple symmetry east', 'Cyclic symmetry east' ) - ishift = 1; jshift = 0; position = EAST - case ('Simple symmetry north', 'Cyclic symmetry north') - ishift = 0; jshift = 1; position = NORTH - end select - - ie = ie+ishift; je = je+jshift - ied = ied+ishift; jed = jed+jshift - ni = nx+ishift; nj = ny+jshift - allocate(global1(1-whalo:ni+ehalo, 1-shalo:nj+nhalo, nz)) - global1 = 0.0 - do k = 1,nz - do j = 1,nj - do i = 1,ni - global1(i,j,k) = k + i*1e-3 + j*1e-6 - end do - end do - enddo - - !--- NOTE: even though the domain is cyclic, no need to apply cyclic condition on the global data - - allocate( x (isd:ied,jsd:jed,nz) ) - allocate( global2D(ni,nj)) - - x(:,:,:) = global1(isd:ied,jsd:jed,:) - do j = 1, nj - do i = 1, ni - global2D(i,j) = sum(global1(i,j,:)) - enddo - enddo - !test mpp_global_sum - - if(type(1:6) == 'Simple') then - gsum = sum( global2D(1:ni,1:nj) ) - else - gsum = sum( global2D(1:nx, 1:ny) ) - endif - id = mpp_clock_id( type//' sum', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) - call mpp_clock_begin(id) - lsum = mpp_global_sum( domain, x, position = position ) - call mpp_clock_end (id) - if( pe.EQ.mpp_root_pe() )print '(a,2es15.8,a,es12.4)', type//' Fast sum=', lsum, gsum + if( 
pe.EQ.mpp_root_pe() )print '(a,2es15.8,a,es12.4)', type//' Fast sum=', lsum, gsum !test exact mpp_global_sum id = mpp_clock_id( type//' exact sum', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) @@ -6986,86 +5397,6 @@ subroutine test_modify_domain( ) end subroutine test_modify_domain - subroutine compare_checksums( a, b, string ) - real, intent(in), dimension(:,:,:) :: a, b - character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 - integer :: i, j, k - - ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. - call mpp_sync_self() - - if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) .or. size(a,3) .ne. size(b,3) ) & - call mpp_error(FATAL,'compare_chksum: size of a and b does not match') - - do k = 1, size(a,3) - do j = 1, size(a,2) - do i = 1, size(a,1) - if(a(i,j,k) .ne. b(i,j,k)) then - write(*,'(a,i3,a,i3,a,i3,a,i3,a,f20.9,a,f20.9)') trim(string)//" at pe ", mpp_pe(), & - ", at point (",i,", ", j, ", ", k, "), a = ", a(i,j,k), ", b = ", b(i,j,k) - call mpp_error(FATAL, trim(string)//': point by point comparison are not OK.') - endif - enddo - enddo - enddo - - sum1 = mpp_chksum( a, (/pe/) ) - sum2 = mpp_chksum( b, (/pe/) ) - - if( sum1.EQ.sum2 )then - if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(string)//': OK.' ) - !--- in some case, even though checksum agree, the two arrays - ! actually are different, like comparing (1.1,-1.2) with (-1.1,1.2) - !--- hence we need to check the value point by point. - else - write(stdunit, *)"sum1 =", sum1, mpp_pe() - write(stdunit, *)"sum2 =", sum2, mpp_pe() - write(stdunit,'(a,i3,a,i20,a,i20)')" at pe ", mpp_pe(), " sum(a)=", sum1, " sum(b)=", sum2 - call mpp_error( FATAL, trim(string)//': chksums are not OK.' 
) - end if - end subroutine compare_checksums - - !########################################################################### - subroutine compare_checksums_2D( a, b, string ) - real, intent(in), dimension(:,:) :: a, b - character(len=*), intent(in) :: string - integer(LONG_KIND) :: sum1, sum2 - integer :: i, j - - ! z1l can not call mpp_sync here since there might be different number of tiles on each pe. - ! mpp_sync() - call mpp_sync_self() - - if(size(a,1) .ne. size(b,1) .or. size(a,2) .ne. size(b,2) ) & - call mpp_error(FATAL,'compare_chksum_2D: size of a and b does not match') - - do j = 1, size(a,2) - do i = 1, size(a,1) - if(a(i,j) .ne. b(i,j)) then - print*, "a =", a(i,j) - print*, "b =", b(i,j) - write(*,'(a,i3,a,i3,a,i3,a,f20.9,a,f20.9)')"at the pe ", mpp_pe(), & - ", at point (",i,", ", j, "),a=", a(i,j), ",b=", b(i,j) - call mpp_error(FATAL, trim(string)//': point by point comparison are not OK.') - endif - enddo - enddo - - sum1 = mpp_chksum( a, (/pe/) ) - sum2 = mpp_chksum( b, (/pe/) ) - - if( sum1.EQ.sum2 )then - if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(string)//': OK.' ) - !--- in some case, even though checksum agree, the two arrays - ! actually are different, like comparing (1.1,-1.2) with (-1.1,1.2) - !--- hence we need to check the value point by point. - else - call mpp_error( FATAL, trim(string)//': chksums are not OK.' 
) - end if - end subroutine compare_checksums_2D - - !########################################################################### subroutine compare_data_scalar( a, b, action, string ) @@ -7629,62 +5960,1846 @@ subroutine get_nnest2(domain, num_nest, tile_coarse, istart_coarse, iend_coarse, call mpp_error(FATAL, "get_nnest2: rotate should be 0 or -90 when ie>ieg") endif endif - enddo - enddo - - end subroutine get_nnest2 - -!############################################################################### - subroutine fill_nest_data(buffer, is, ie, js, je, nnest, tile, ishift, jshift, iadd, jadd, rotate, & - isl, iel, jsl, jel, xadd, yadd, sign1, sign2) - real, dimension(is:,js:,:), intent(inout) :: buffer - integer, intent(in) :: is, ie, js, je, nnest - integer, intent(in) :: ishift, jshift - integer, dimension(:), intent(in) :: tile, iadd, jadd, rotate, isl, iel, jsl, jel - real, intent(in) :: xadd, yadd - integer, intent(in) :: sign1, sign2 - integer :: i, j, k, n, nk - integer :: ioff, joff - - ioff = 0 - joff = 0 - nk = size(buffer,3) - do k = 1, nk - do n = 1, nnest - if(iel(n) == ie) ioff = ishift - if(jel(n) == je) joff = jshift - - select case (rotate(n)) - case(ZERO) - do j = jsl(n), jel(n)+joff - do i = isl(n), iel(n)+ioff - buffer(i,j,k) = xadd + tile(n) + (i-iadd(n))*1.e-3 + (j-jadd(n))*1.e-6 + k*1.e-9 - enddo - enddo - case (NINETY) - do j = jsl(n), jel(n)+joff - do i = isl(n), iel(n)+ioff - buffer(i,j,k) = sign2*(yadd + tile(n) + (j-jadd(n))*1.e-3 + (nx-i+iadd(n)+1+ioff)*1.e-6 + k*1.e-9) - enddo - enddo - case (MINUS_NINETY) - do j = jsl(n), jel(n)+joff - do i = isl(n), iel(n)+ioff - buffer(i,j,k) = sign1*(yadd + tile(n) + (ny-j+jadd(n)+1+joff)*1.e-3 + (i-iadd(n))*1.e-6 + k*1.e-9) - enddo - enddo - case default - call mpp_error(FATAL,"fill_nest_data: rotate must be ZERO, NINETY, MINUS_NINETY") - end select - enddo - enddo + enddo + enddo + + end subroutine get_nnest2 + +!############################################################################### + 
+ subroutine test_update_nest_domain_r8( type ) + character(len=*), intent(in) :: type + logical :: cubic_grid + logical :: is_fine_pe, is_coarse_pe + integer :: n, i, j, k + integer :: ntiles, npes_per_tile + integer :: npes_fine, pos + integer :: isc_coarse, iec_coarse, jsc_coarse, jec_coarse + integer :: isd_coarse, ied_coarse, jsd_coarse, jed_coarse + integer :: isd_fine, ied_fine, jsd_fine, jed_fine + integer :: isc_fine, iec_fine, jsc_fine, jec_fine + integer :: nx_fine, ny_fine, nx_coarse, ny_coarse + integer :: nxc_fine, nyc_fine, nxc_coarse, nyc_coarse + integer :: isw_f, iew_f, jsw_f, jew_f, isw_c, iew_c, jsw_c, jew_c + integer :: ise_f, iee_f, jse_f, jee_f, ise_c, iee_c, jse_c, jee_c + integer :: iss_f, ies_f, jss_f, jes_f, iss_c, ies_c, jss_c, jes_c + integer :: isn_f, ien_f, jsn_f, jen_f, isn_c, ien_c, jsn_c, jen_c + integer :: isw_fx, iew_fx, jsw_fx, jew_fx, isw_cx, iew_cx, jsw_cx, jew_cx + integer :: ise_fx, iee_fx, jse_fx, jee_fx, ise_cx, iee_cx, jse_cx, jee_cx + integer :: iss_fx, ies_fx, jss_fx, jes_fx, iss_cx, ies_cx, jss_cx, jes_cx + integer :: isn_fx, ien_fx, jsn_fx, jen_fx, isn_cx, ien_cx, jsn_cx, jen_cx + integer :: isw_fy, iew_fy, jsw_fy, jew_fy, isw_cy, iew_cy, jsw_cy, jew_cy + integer :: ise_fy, iee_fy, jse_fy, jee_fy, ise_cy, iee_cy, jse_cy, jee_cy + integer :: iss_fy, ies_fy, jss_fy, jes_fy, iss_cy, ies_cy, jss_cy, jes_cy + integer :: isn_fy, ien_fy, jsn_fy, jen_fy, isn_cy, ien_cy, jsn_cy, jen_cy + integer :: isw_f2, iew_f2, jsw_f2, jew_f2, isw_c2, iew_c2, jsw_c2, jew_c2, tile_w2 + integer :: ise_f2, iee_f2, jse_f2, jee_f2, ise_c2, iee_c2, jse_c2, jee_c2, tile_e2 + integer :: iss_f2, ies_f2, jss_f2, jes_f2, iss_c2, ies_c2, jss_c2, jes_c2, tile_s2 + integer :: isn_f2, ien_f2, jsn_f2, jen_f2, isn_c2, ien_c2, jsn_c2, jen_c2, tile_n2 + integer :: isw_fx2, iew_fx2, jsw_fx2, jew_fx2, isw_cx2, iew_cx2, jsw_cx2, jew_cx2, tile_wx2 + integer :: ise_fx2, iee_fx2, jse_fx2, jee_fx2, ise_cx2, iee_cx2, jse_cx2, jee_cx2, tile_ex2 + integer :: iss_fx2, 
ies_fx2, jss_fx2, jes_fx2, iss_cx2, ies_cx2, jss_cx2, jes_cx2, tile_sx2 + integer :: isn_fx2, ien_fx2, jsn_fx2, jen_fx2, isn_cx2, ien_cx2, jsn_cx2, jen_cx2, tile_nx2 + integer :: isw_fy2, iew_fy2, jsw_fy2, jew_fy2, isw_cy2, iew_cy2, jsw_cy2, jew_cy2, tile_wy2 + integer :: ise_fy2, iee_fy2, jse_fy2, jee_fy2, ise_cy2, iee_cy2, jse_cy2, jee_cy2, tile_ey2 + integer :: iss_fy2, ies_fy2, jss_fy2, jes_fy2, iss_cy2, ies_cy2, jss_cy2, jes_cy2, tile_sy2 + integer :: isn_fy2, ien_fy2, jsn_fy2, jen_fy2, isn_cy2, ien_cy2, jsn_cy2, jen_cy2, tile_ny2 + integer :: isw_f_T, iew_f_T, jsw_f_T, jew_f_T, isw_c_T, iew_c_T, jsw_c_T, jew_c_T + integer :: ise_f_T, iee_f_T, jse_f_T, jee_f_T, ise_c_T, iee_c_T, jse_c_T, jee_c_T + integer :: iss_f_T, ies_f_T, jss_f_T, jes_f_T, iss_c_T, ies_c_T, jss_c_T, jes_c_T + integer :: isn_f_T, ien_f_T, jsn_f_T, jen_f_T, isn_c_T, ien_c_T, jsn_c_T, jen_c_T + integer :: is_c, ie_c, js_c, je_c, is_f, ie_f, js_f, je_f + integer :: is_cx, ie_cx, js_cx, je_cx, is_fx, ie_fx, js_fx, je_fx + integer :: is_cy, ie_cy, js_cy, je_cy, is_fy, ie_fy, js_fy, je_fy + integer :: tile, position, shift + integer :: layout_fine(2), my_fine_id + integer, allocatable :: pelist(:), start_pos(:), end_pos(:) + integer, allocatable :: my_pelist_fine(:) + integer, allocatable :: pe_start(:), pe_end(:) + integer, allocatable :: layout2D(:,:), global_indices(:,:) + real(kind=r8_kind), allocatable :: x(:,:,:), x1(:,:,:), x2(:,:,:) + real(kind=r8_kind), allocatable :: y(:,:,:), y1(:,:,:), y2(:,:,:) + real(kind=r8_kind), allocatable :: wbuffer(:,:,:), wbuffer2(:,:,:) + real(kind=r8_kind), allocatable :: ebuffer(:,:,:), ebuffer2(:,:,:) + real(kind=r8_kind), allocatable :: sbuffer(:,:,:), sbuffer2(:,:,:) + real(kind=r8_kind), allocatable :: nbuffer(:,:,:), nbuffer2(:,:,:) + real(kind=r8_kind), allocatable :: wbufferx(:,:,:), wbufferx2(:,:,:) + real(kind=r8_kind), allocatable :: ebufferx(:,:,:), ebufferx2(:,:,:) + real(kind=r8_kind), allocatable :: sbufferx(:,:,:), sbufferx2(:,:,:) + 
real(kind=r8_kind), allocatable :: nbufferx(:,:,:), nbufferx2(:,:,:) + real(kind=r8_kind), allocatable :: wbuffery(:,:,:), wbuffery2(:,:,:) + real(kind=r8_kind), allocatable :: ebuffery(:,:,:), ebuffery2(:,:,:) + real(kind=r8_kind), allocatable :: sbuffery(:,:,:), sbuffery2(:,:,:) + real(kind=r8_kind), allocatable :: nbuffery(:,:,:), nbuffery2(:,:,:) + integer :: x_refine(num_nest), y_refine(num_nest) + integer :: istart_fine(num_nest), iend_fine(num_nest) + integer :: jstart_fine(num_nest), jend_fine(num_nest) + integer :: iend_coarse(num_nest), jend_coarse(num_nest) + integer :: is_fine(6*num_nest), ie_fine(6*num_nest) + integer :: js_fine(6*num_nest), je_fine(6*num_nest) + integer :: is_coarse(6*num_nest), ie_coarse(6*num_nest) + integer :: js_coarse(6*num_nest), je_coarse(6*num_nest) + integer :: t_coarse(6*num_nest), rotate_coarse(6*num_nest) + integer :: iadd_coarse(6*num_nest), jadd_coarse(6*num_nest) + integer :: nnest + character(len=128) :: type2 + character(len=32) :: text, pelist_name + type(domain2d) :: domain + type(domain2d), pointer :: domain_coarse=>NULL() + type(domain2d), pointer :: domain_fine=>NULL() + type(nest_domain_type) :: nest_domain + logical :: x_cyclic, y_cyclic + integer :: my_tile_id(1), my_num_nest + integer, dimension(num_nest) :: my_tile_coarse, my_tile_fine, my_istart_coarse, my_iend_coarse + integer, dimension(num_nest) :: my_jstart_coarse, my_jend_coarse + integer :: ntiles_nest_top, npes_nest_top, num_nest_level, my_npes, l + integer :: npes_my_fine, npes_my_level + integer, allocatable :: my_pelist(:) + + x_cyclic = .false. + y_cyclic = .false. + if(cyclic_nest(1) == 'X') then + x_cyclic = .true. + else if(cyclic_nest(1) == 'Y') then + y_cyclic = .true. 
+ endif + + istart_fine = 0; iend_fine = -1 + jstart_fine = 0; jend_fine = -1 + iend_coarse = -1; jend_coarse = -1 + is_fine = 0; ie_fine = -1 + js_fine = 0; je_fine = -1 + is_coarse = 0; ie_coarse = -1 + js_coarse = 0; je_coarse = -1 + t_coarse = 0; rotate_coarse = -1; + iadd_coarse = 0; jadd_coarse = 0 + + select case(type) + case ( 'Cubic-Grid' ) + if( nx_cubic == 0 ) then + call mpp_error(NOTE,'test_update_nest_domain: for Cubic_grid mosaic, nx_cubic is zero, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + if( nx_cubic .NE. ny_cubic ) then + call mpp_error(NOTE,'test_update_nest_domain: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + nx = nx_cubic + ny = ny_cubic + ntiles_nest_top = 6 + cubic_grid = .true. + case default + call mpp_error(FATAL, 'test_update_nest_domain: no such test: '//type) + end select + + if(ntiles_nest_all > MAX_NTILE) call mpp_error(FATAL, 'test_update_nest_domain: ntiles_nest_all > MAX_NTILE') + if(ntiles_nest_top .GE. ntiles_nest_all) call mpp_error(FATAL, 'test_update_nest_domain: ntiles_nest_top .GE. ntile_nest_all') + if(ntiles_nest_all .NE. ntiles_nest_top + num_nest) call mpp_error(FATAL, & + 'test_update_nest_domain: ntiles_nest_all .NE. ntiles_nest_top + num_nest') + !--- for the ntiles_nest_top, number of processors should be same + do n = 1, ntiles_nest_all + if(npes_nest_tile(n) .LE. 0) call mpp_error(FATAL, & + 'test_update_nest_domain: npes_nest_tile is not properly set') + enddo + do n = 2, ntiles_nest_top + if(npes_nest_tile(n) .NE. npes_nest_tile(n-1)) call mpp_error(FATAL, & + 'test_update_nest_domain: each tile of top mosaic grid should use same number of MPI ranks') + enddo + npes_nest_top = ntiles_nest_top * npes_nest_tile(1) + + npes = mpp_npes() + + !--- make sure sum(npes_nest_tile) == npes + if(sum(npes_nest_tile(1:ntiles_nest_all)) .NE. npes ) & + call mpp_error(FATAL, "test_mpp_domains: sum(npes_nest_tile) .NE. 
npes") + + !--- make sure tile_fine are monotonically increasing and equal to ntiles_nest_top + nest number + do n = 1, num_nest + if(tile_fine(n) .NE. ntiles_nest_top+n) call mpp_error(FATAL, & + "test_mpp_domains: tile_fine(n) .NE. ntiles_nest_top+n") + enddo + + !---make sure nest_level is setup properly + if(nest_level(1) .NE. 1) call mpp_error(FATAL, "test_mpp_domains: nest_level(1) .NE. 1") + do n = 2, num_nest + if(nest_level(n) > nest_level(n-1)+1) call mpp_error(FATAL, "test_mpp_domains: nest_level(n) > nest_level(n-1)+1") + if(nest_level(n) < nest_level(n-1) ) call mpp_error(FATAL, "test_mpp_domains: nest_level(n) < nest_level(n-1)") + enddo + num_nest_level = nest_level(num_nest) + + allocate(pelist(npes)) + call mpp_get_current_pelist(pelist) + + !--- compute iend_coarse and jend_coarse + do n = 1, num_nest + iend_coarse(n) = istart_coarse(n) + icount_coarse(n) - 1 + jend_coarse(n) = jstart_coarse(n) + jcount_coarse(n) - 1 + istart_fine(n) = 1; iend_fine(n) = icount_coarse(n)*refine_ratio(n) + jstart_fine(n) = 1; jend_fine(n) = jcount_coarse(n)*refine_ratio(n) + enddo + + !--- first define the top level grid mosaic domain. 
+ + !--- setup pelist for top level + allocate(my_pelist(npes_nest_top)) + do n = 1, npes_nest_top + my_pelist(n) = pelist(n) + enddo + call mpp_declare_pelist(my_pelist) + if(ANY(my_pelist==mpp_pe())) then + call mpp_set_current_pelist(my_pelist) + + allocate(layout2D(2,ntiles_nest_top), global_indices(4,ntiles_nest_top), pe_start(ntiles_nest_top), pe_end(ntiles_nest_top) ) + npes_per_tile = npes_nest_tile(1) + + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + do n = 1, ntiles_nest_top + global_indices(:,n) = (/1,nx,1,ny/) + layout2D(:,n) = layout + end do + do n = 1, ntiles_nest_top + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + + if( cubic_grid ) then + call define_cubic_mosaic(type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & + global_indices, layout2D, pe_start, pe_end ) + endif + call mpp_get_compute_domain(domain, isc_coarse, iec_coarse, jsc_coarse, jec_coarse) + call mpp_get_data_domain(domain, isd_coarse, ied_coarse, jsd_coarse, jed_coarse) + deallocate(layout2D, global_indices, pe_start, pe_end ) + endif + + call mpp_set_current_pelist() + deallocate(my_pelist) + !--- define domain for all the nest regoin. 
+ pos = npes_nest_top + do n = 1, num_nest + my_npes = npes_nest_tile(tile_fine(n)) + allocate(my_pelist(my_npes)) + my_pelist(:) = pelist(pos+1:pos+my_npes) + call mpp_declare_pelist(my_pelist) + if(ANY(my_pelist==mpp_pe())) then + call mpp_set_current_pelist(my_pelist) + nx_fine = iend_fine(n) - istart_fine(n) + 1 + ny_fine = jend_fine(n) - jstart_fine(n) + 1 + call mpp_define_layout( (/1,nx_fine,1,ny_fine/), my_npes, layout ) + call mpp_define_domains((/1,nx_fine,1,ny_fine/), layout, domain, & + whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + symmetry=.true., name=trim(type)//' fine grid', tile_id = tile_fine(n) ) + call mpp_get_compute_domain(domain, isc_fine, iec_fine, jsc_fine, jec_fine) + call mpp_get_data_domain(domain, isd_fine, ied_fine, jsd_fine, jed_fine) + !--- test halo update for nested region. + call test_nest_halo_update(domain) + endif + pos = pos+my_npes + deallocate(my_pelist) + call mpp_set_current_pelist() + enddo + + !--- reset to the global pelist + call mpp_set_current_pelist() + + x_refine(:) = refine_ratio(1:num_nest) + y_refine(:) = refine_ratio(1:num_nest) + + call mpp_define_nest_domains(nest_domain, domain, num_nest, nest_level(1:num_nest), tile_fine(1:num_nest), & + tile_coarse(1:num_nest), istart_coarse(1:num_nest), icount_coarse(1:num_nest), jstart_coarse(1:num_nest), & + jcount_coarse(1:num_nest), npes_nest_tile(1:ntiles_nest_all), & + x_refine(1:num_nest), y_refine(1:num_nest), extra_halo=extra_halo, name="nest_domain") + + !--- loop over nest level + do l = 1, num_nest_level + npes_my_level = mpp_get_nest_npes(nest_domain, l) + npes_my_fine = mpp_get_nest_fine_npes(nest_domain,l) + allocate(my_pelist(npes_my_level)) + allocate(my_pelist_fine(npes_my_fine)) + call mpp_get_nest_pelist(nest_domain, l, my_pelist) + call mpp_get_nest_fine_pelist(nest_domain, l, my_pelist_fine) + + call mpp_declare_pelist(my_pelist(:)) + write(type2, '(a,I2)')trim(type)//" nest_level = ",l + if(ANY(my_pelist(:)==mpp_pe())) then + call 
mpp_set_current_pelist(my_pelist) + my_tile_id = mpp_get_tile_id(domain) + domain_coarse => mpp_get_nest_coarse_domain(nest_domain, nest_level=l) + domain_fine => mpp_get_nest_fine_domain(nest_domain, nest_level=l) + is_fine_pe = mpp_is_nest_fine(nest_domain, l) + is_coarse_pe = mpp_is_nest_coarse(nest_domain, l) + if(is_fine_pe .eqv. is_coarse_pe) call mpp_error(FATAL, "test_mpp_domains: is_fine_pe .eqv. is_coarse_pe") + my_num_nest = 0 + my_fine_id = 0 + do n = 1, num_nest + if(nest_level(n)==l) then + my_num_nest = my_num_nest+1 + my_tile_coarse(my_num_nest) = tile_coarse(n) + my_tile_fine(my_num_nest) = tile_fine(n) + my_istart_coarse(my_num_nest) = istart_coarse(n) + my_iend_coarse(my_num_nest) = iend_coarse(n) + my_jstart_coarse(my_num_nest) = jstart_coarse(n) + my_jend_coarse(my_num_nest) = jend_coarse(n) + if(my_tile_id(1) == tile_fine(n)) my_fine_id = n + endif + enddo + !--- each nest region might be over multiple face of cubic sphere grid. + !---Get the number of nest region with consideration of face. + call get_nnest(domain_coarse, my_num_nest, my_tile_coarse, my_istart_coarse, my_iend_coarse, & + my_jstart_coarse, my_jend_coarse, nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, & + is_coarse, ie_coarse, js_coarse, je_coarse) + + !--------------------------------------------------------------------------- + ! + ! fine to coarse scalar field, limit to position=CENTER. + ! 
+ !--------------------------------------------------------------------------- + if(is_fine_pe) then + call mpp_get_compute_domain(domain_fine, isc_fine, iec_fine, jsc_fine, jec_fine) + call mpp_get_data_domain(domain_fine, isd_fine, ied_fine, jsd_fine, jed_fine) + endif + + if(is_coarse_pe) then + call mpp_get_compute_domain(domain_coarse, isc_coarse, iec_coarse, jsc_coarse, jec_coarse) + call mpp_get_data_domain(domain_coarse, isd_coarse, ied_coarse, jsd_coarse, jed_coarse) + endif + + if(is_fine_pe) then + call mpp_get_F2C_index(nest_domain, is_c, ie_c, js_c, je_c, is_f, ie_f, js_f, je_f, l, position=CENTER) + allocate(x(is_c:ie_c, js_c:je_c, nz)) + x = 0 + do k = 1, nz + do j = js_c, je_c + do i = is_c, ie_c + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 0.001 + enddo + enddo + enddo + else + allocate(x1(isd_coarse:ied_coarse, jsd_coarse:jed_coarse, nz)) + allocate(x2(isd_coarse:ied_coarse, jsd_coarse:jed_coarse, nz)) + x1 = 0 + tile = my_tile_id(1) + + do k = 1, nz + do j = jsc_coarse, jec_coarse + do i = isc_coarse, iec_coarse + x1(i,j,k) = i*1.e+6 + j*1.e+3 + k + 0.002 + enddo + enddo + enddo + x2 = x1 + endif + + + if(is_coarse_pe) then + do n = 1, nnest + is_c = max(is_coarse(n), isc_coarse) + ie_c = min(ie_coarse(n), iec_coarse) + js_c = max(js_coarse(n), jsc_coarse) + je_c = min(je_coarse(n), jec_coarse) + if( tile == t_coarse(n) .AND. ie_c .GE. is_c .AND. je_c .GE. 
js_c ) then + call fill_coarse_data(x2, rotate_coarse(n), iadd_coarse(n), jadd_coarse(n), & + is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, 0, 0, 0.001, 0.001, 1, 1, & + .false., .false., iend_coarse(1), jend_coarse(1) ) + endif + enddo + endif + + call mpp_update_nest_coarse(x, nest_domain, x1, nest_level=l, position=CENTER) + + !--- compare with assumed value + if( is_coarse_pe) then + call compare_checksums(x1, x2, trim(type2)//' fine to coarse scalar') + endif + if(allocated(x)) deallocate(x) + if(allocated(x1)) deallocate(x1) + if(allocated(x2)) deallocate(x2) + !--------------------------------------------------------------------------- + ! + ! fine to coarse CGRID scalar pair update + ! + !--------------------------------------------------------------------------- + shift = 1 + + if(is_fine_pe) then + call mpp_get_F2C_index(nest_domain, is_cx, ie_cx, js_cx, je_cx, is_fx, ie_fx, js_fx, je_fx, l, position=EAST) + call mpp_get_F2C_index(nest_domain, is_cy, ie_cy, js_cy, je_cy, is_fy, ie_fy, js_fy, je_fy, l, position=NORTH) + allocate(x(is_cx:ie_cx, js_cx:je_cx, nz)) + allocate(y(is_cy:ie_cy, js_cy:je_cy, nz)) + x = 0 + y = 0 + do k = 1, nz + do j = js_cx, je_cx + do i = is_cx, ie_cx + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 1.0E-6 + enddo + enddo + enddo + do k = 1, nz + do j = js_cy, je_cy + do i = is_cy, ie_cy + y(i,j,k) = i*1.e+6 + j*1.e+3 + k + 2.0E-6 + enddo + enddo + enddo + if(x_cyclic) then + if(ie_cx == iend_coarse(1)+1) then + i = ie_cx + do k = 1, nz + do j = js_cx, je_cx + x(i,j,k) = istart_coarse(1)*1.e+6 + j*1.e+3 + k + 1.0E-6 + enddo + enddo + endif + endif + if(y_cyclic) then + if(je_cx == jend_coarse(1)+1) then + j = je_cx + do k = 1, nz + do i = is_cx, ie_cx + y(i,j,k) = i*1.e+6 + jstart_coarse(1)*1.e+3 + k + 1.0E-6 + enddo + enddo + endif + endif + else + allocate(x1(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse, nz)) + allocate(x2(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse, nz)) + allocate(y1(isd_coarse:ied_coarse, 
jsd_coarse:jed_coarse+shift, nz)) + allocate(y2(isd_coarse:ied_coarse, jsd_coarse:jed_coarse+shift, nz)) + x1 = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, jec_coarse + do i = isc_coarse, iec_coarse+shift + x1(i,j,k) = i*1.e+6 + j*1.e+3 + k + 0.001 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_coarse, jec_coarse+shift + do i = isc_coarse, iec_coarse + y1(i,j,k) = i*1.e+6 + j*1.e+3 + k + 0.002 + enddo + enddo + enddo + x2 = x1 + y2 = y1 + endif + + + if(is_coarse_pe) then + do n = 1, nnest + is_c = max(is_coarse(n), isc_coarse) + ie_c = min(ie_coarse(n), iec_coarse) + js_c = max(js_coarse(n), jsc_coarse) + je_c = min(je_coarse(n), jec_coarse) + if( tile == t_coarse(n) .AND. ie_c+shift .GE. is_c .AND. je_c .GE. js_c ) then + call fill_coarse_data(x2, rotate_coarse(n), iadd_coarse(n), jadd_coarse(n), & + is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, shift, 0, 1.0E-6, 2.0E-6, 1, 1, & + x_cyclic, .false., iend_coarse(1)+1, jend_coarse(1)+1) + endif + if( tile == t_coarse(n) .AND. ie_c .GE. is_c .AND. je_c+shift .GE. js_c ) then + call fill_coarse_data(y2, rotate_coarse(n), iadd_coarse(n), jadd_coarse(n), & + is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, 0, shift, 2.0E-6, 1.0E-6, 1, 1, & + .false., y_cyclic, iend_coarse(1)+1, jend_coarse(1)+1) + endif + enddo + endif + + call mpp_update_nest_coarse(x, y, nest_domain, x1, y1, nest_level=l, gridtype=CGRID_NE, flags=SCALAR_PAIR) + + !--- compare with assumed value + if( is_coarse_pe) then + call compare_checksums(x1, x2, trim(type2)//' fine to coarse buffer CGRID Scalar_pair X') + call compare_checksums(x1, x2, trim(type2)//' fine to coarse buffer CGRID Scalar_pair Y') + endif + if(allocated(x)) deallocate(x) + if(allocated(x1)) deallocate(x1) + if(allocated(x2)) deallocate(x2) + if(allocated(y)) deallocate(y) + if(allocated(y1)) deallocate(y1) + if(allocated(y2)) deallocate(y2) + + !--------------------------------------------------------------------------- + ! + ! 
fine to coarse CGRID vector update + ! + !--------------------------------------------------------------------------- + shift = 1 + + if(is_fine_pe) then + call mpp_get_F2C_index(nest_domain, is_cx, ie_cx, js_cx, je_cx, is_fx, ie_fx, js_fx, je_fx, l, position=EAST) + call mpp_get_F2C_index(nest_domain, is_cy, ie_cy, js_cy, je_cy, is_fy, ie_fy, js_fy, je_fy, l, position=NORTH) + allocate(x(is_cx:ie_cx, js_cx:je_cx, nz)) + allocate(y(is_cy:ie_cy, js_cy:je_cy, nz)) + x = 0 + y = 0 + do k = 1, nz + do j = js_cx, je_cx + do i = is_cx, ie_cx + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 1.0E-6 + enddo + enddo + enddo + do k = 1, nz + do j = js_cy, je_cy + do i = is_cy, ie_cy + y(i,j,k) = i*1.e+6 + j*1.e+3 + k + 2.0E-6 + enddo + enddo + enddo + else + allocate(x1(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse, nz)) + allocate(x2(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse, nz)) + allocate(y1(isd_coarse:ied_coarse, jsd_coarse:jed_coarse+shift, nz)) + allocate(y2(isd_coarse:ied_coarse, jsd_coarse:jed_coarse+shift, nz)) + x1 = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, jec_coarse + do i = isc_coarse, iec_coarse+shift + x1(i,j,k) = i*1.e+6 + j*1.e+3 + k + 0.001 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_coarse, jec_coarse+shift + do i = isc_coarse, iec_coarse + y1(i,j,k) = i*1.e+6 + j*1.e+3 + k + 0.002 + enddo + enddo + enddo + x2 = x1 + y2 = y1 + endif + + + if(is_coarse_pe) then + do n = 1, nnest + is_c = max(is_coarse(n), isc_coarse) + ie_c = min(ie_coarse(n), iec_coarse) + js_c = max(js_coarse(n), jsc_coarse) + je_c = min(je_coarse(n), jec_coarse) + if( tile == t_coarse(n) .AND. ie_c+shift .GE. is_c .AND. je_c .GE. js_c ) then + call fill_coarse_data(x2, rotate_coarse(n), iadd_coarse(n), jadd_coarse(n), & + is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, shift, 0, 1.0E-6, 2.0E-6, 1, -1, & + x_cyclic, .false., iend_coarse(1)+1, jend_coarse(1)+1) + endif + if( tile == t_coarse(n) .AND. ie_c .GE. is_c .AND. je_c+shift .GE. 
js_c ) then + call fill_coarse_data(y2, rotate_coarse(n), iadd_coarse(n), jadd_coarse(n), & + is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, 0, shift, 2.0E-6, 1.0E-6, -1, 1, & + .false., y_cyclic, iend_coarse(1)+1, jend_coarse(1)+1) + endif + enddo + endif + + call mpp_update_nest_coarse(x, y, nest_domain, x1, y1, nest_level=l, gridtype=CGRID_NE) + + !--- compare with assumed value + if( is_coarse_pe) then + call compare_checksums(x1, x2, trim(type2)//' fine to coarse buffer CGRID Vector X') + call compare_checksums(x1, x2, trim(type2)//' fine to coarse buffer CGRID Vector Y') + endif + if(allocated(x)) deallocate(x) + if(allocated(x1)) deallocate(x1) + if(allocated(x2)) deallocate(x2) + if(allocated(y)) deallocate(y) + if(allocated(y1)) deallocate(y1) + if(allocated(y2)) deallocate(y2) + + !--------------------------------------------------------------------------- + ! + ! fine to coarse DGRID vector update + ! + !--------------------------------------------------------------------------- + shift = 1 + + if(is_fine_pe) then + call mpp_get_F2C_index(nest_domain, is_cx, ie_cx, js_cx, je_cx, is_fx, ie_fx, js_fx, je_fx, l, position=NORTH) + call mpp_get_F2C_index(nest_domain, is_cy, ie_cy, js_cy, je_cy, is_fy, ie_fy, js_fy, je_fy, l, position=EAST) + allocate(x(is_cx:ie_cx, js_cx:je_cx, nz)) + allocate(y(is_cy:ie_cy, js_cy:je_cy, nz)) + x = 0 + y = 0 + do k = 1, nz + do j = js_cx, je_cx + do i = is_cx, ie_cx + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 1.0E-6 + enddo + enddo + enddo + do k = 1, nz + do j = js_cy, je_cy + do i = is_cy, ie_cy + y(i,j,k) = i*1.e+6 + j*1.e+3 + k + 2.0E-6 + enddo + enddo + enddo + else + allocate(x1(isd_coarse:ied_coarse, jsd_coarse:jed_coarse+shift, nz)) + allocate(x2(isd_coarse:ied_coarse, jsd_coarse:jed_coarse+shift, nz)) + allocate(y1(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse, nz)) + allocate(y2(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse, nz)) + x1 = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, 
jec_coarse+shift + do i = isc_coarse, iec_coarse + x1(i,j,k) = i*1.e+6 + j*1.e+3 + k + 0.001 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_coarse, jec_coarse + do i = isc_coarse, iec_coarse+shift + y1(i,j,k) = i*1.e+6 + j*1.e+3 + k + 0.002 + enddo + enddo + enddo + x2 = x1 + y2 = y1 + endif + + + if(is_coarse_pe) then + do n = 1, nnest + is_c = max(is_coarse(n), isc_coarse) + ie_c = min(ie_coarse(n), iec_coarse) + js_c = max(js_coarse(n), jsc_coarse) + je_c = min(je_coarse(n), jec_coarse) + if( tile == t_coarse(n) .AND. ie_c .GE. is_c .AND. je_c+shift .GE. js_c ) then + call fill_coarse_data(x2, rotate_coarse(n), iadd_coarse(n), jadd_coarse(n), & + is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, 0, shift, 1.0E-6, 2.0E-6, 1, -1, & + .false., y_cyclic, iend_coarse(1), jend_coarse(1) ) + endif + if( tile == t_coarse(n) .AND. ie_c+shift .GE. is_c .AND. je_c .GE. js_c ) then + call fill_coarse_data(y2, rotate_coarse(n), iadd_coarse(n), jadd_coarse(n), & + is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, shift, 0, 2.0E-6, 1.0E-6, -1, 1, & + x_cyclic, .false., iend_coarse(1), jend_coarse(1)) + endif + enddo + endif + + call mpp_update_nest_coarse(x, y, nest_domain, x1, y1, nest_level=l, gridtype=DGRID_NE) + + !--- compare with assumed value + if( is_coarse_pe) then + call compare_checksums(x1, x2, trim(type2)//' fine to coarse buffer DGRID Vector X') + call compare_checksums(x1, x2, trim(type2)//' fine to coarse buffer DGRID Vector Y') + endif + if(allocated(x)) deallocate(x) + if(allocated(x1)) deallocate(x1) + if(allocated(x2)) deallocate(x2) + if(allocated(y)) deallocate(y) + if(allocated(y1)) deallocate(y1) + if(allocated(y2)) deallocate(y2) + + + !--------------------------------------------------------------------------- + ! + ! Coarse to Fine scalar field, position = CENTER + ! 
+ !--------------------------------------------------------------------------- + + !--- first check the index is correct or not + !--- The index from nest domain + call mpp_get_C2F_index(nest_domain, isw_f, iew_f, jsw_f, jew_f, isw_c, iew_c, jsw_c, jew_c, WEST, l) + call mpp_get_C2F_index(nest_domain, ise_f, iee_f, jse_f, jee_f, ise_c, iee_c, jse_c, jee_c, EAST, l) + call mpp_get_C2F_index(nest_domain, iss_f, ies_f, jss_f, jes_f, iss_c, ies_c, jss_c, jes_c, SOUTH, l) + call mpp_get_C2F_index(nest_domain, isn_f, ien_f, jsn_f, jen_f, isn_c, ien_c, jsn_c, jen_c, NORTH, l) + + if(is_fine_pe) then + call mpp_get_compute_domain(domain, isc_fine, iec_fine, jsc_fine, jec_fine) + call mpp_get_data_domain(domain, isd_fine, ied_fine, jsd_fine, jed_fine) + + !-- The assumed index + isw_f2 = 0; iew_f2 = -1; jsw_f2 = 0; jew_f2 = -1 + isw_c2 = 0; iew_c2 = -1; jsw_c2 = 0; jew_c2 = -1 + ise_f2 = 0; iee_f2 = -1; jse_f2 = 0; jee_f2 = -1 + ise_c2 = 0; iee_c2 = -1; jse_c2 = 0; jee_c2 = -1 + iss_f2 = 0; ies_f2 = -1; jss_f2 = 0; jes_f2 = -1 + iss_c2 = 0; ies_c2 = -1; jss_c2 = 0; jes_c2 = -1 + isn_f2 = 0; ien_f2 = -1; jsn_f2 = 0; jen_f2 = -1 + isn_c2 = 0; ien_c2 = -1; jsn_c2 = 0; jen_c2 = -1 + + !--- west + if( isc_fine == 1 ) then + isw_f2 = isd_fine; iew_f2 = isc_fine - 1 + jsw_f2 = jsd_fine; jew_f2 = jed_fine + isw_c2 = istart_coarse(my_fine_id)-whalo + iew_c2 = istart_coarse(my_fine_id) + jsw_c2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jew_c2 = jstart_coarse(my_fine_id) + (jec_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + endif + !--- east + if( iec_fine == nx_fine ) then + ise_f2 = iec_fine+1; iee_f2 = ied_fine + jse_f2 = jsd_fine; jee_f2 = jed_fine + ise_c2 = iend_coarse(my_fine_id) + iee_c2 = iend_coarse(my_fine_id)+ehalo + jse_c2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jee_c2 = jstart_coarse(my_fine_id) + (jec_fine - 
jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + endif + !--- south + if( jsc_fine == 1 ) then + iss_f2 = isd_fine; ies_f2 = ied_fine + jss_f2 = jsd_fine; jes_f2 = jsc_fine - 1 + iss_c2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ies_c2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + jss_c2 = jstart_coarse(my_fine_id)-shalo + jes_c2 = jstart_coarse(my_fine_id) + endif + !--- north + if( jec_fine == ny_fine ) then + isn_f2 = isd_fine; ien_f2 = ied_fine + jsn_f2 = jec_fine+1; jen_f2 = jed_fine + isn_c2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ien_c2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + jsn_c2 = jend_coarse(my_fine_id) + jen_c2 = jend_coarse(my_fine_id)+nhalo + endif + + if( isw_f .NE. isw_f2 .OR. iew_f .NE. iew_f2 .OR. jsw_f .NE. jsw_f2 .OR. jew_f .NE. jew_f2 .OR. & + isw_c .NE. isw_c2 .OR. iew_c .NE. iew_c2 .OR. jsw_c .NE. jsw_c2 .OR. jew_c .NE. jew_c2 ) then + write(5000+mpp_pe(),*), "west buffer fine index = ", isw_f, iew_f, jsw_f, jew_f + write(5000+mpp_pe(),*), "west buffer fine index2 = ", isw_f2, iew_f2, jsw_f2, jew_f2 + write(5000+mpp_pe(),*), "west buffer coarse index = ", isw_c, iew_c, jsw_c, jew_c + write(5000+mpp_pe(),*), "west buffer coarse index2 = ", isw_c2, iew_c2, jsw_c2, jew_c2 + call mpp_error(FATAL, "test_mpp_domains: west buffer index mismatch for coarse to fine scalar") + endif + if( ise_f .NE. ise_f2 .OR. iee_f .NE. iee_f2 .OR. jse_f .NE. jse_f2 .OR. jee_f .NE. jee_f2 .OR. & + ise_c .NE. ise_c2 .OR. iee_c .NE. iee_c2 .OR. jse_c .NE. jse_c2 .OR. jee_c .NE. jee_c2 ) then + call mpp_error(FATAL, "test_mpp_domains: east buffer index mismatch for coarse to fine scalar") + endif + if( iss_f .NE. iss_f2 .OR. ies_f .NE. ies_f2 .OR. jss_f .NE. jss_f2 .OR. jes_f .NE. jes_f2 .OR. & + iss_c .NE. iss_c2 .OR. ies_c .NE. ies_c2 .OR. 
jss_c .NE. jss_c2 .OR. jes_c .NE. jes_c2 ) then + call mpp_error(FATAL, "test_mpp_domains: south buffer index mismatch for coarse to fine scalar") + endif + if( isn_f .NE. isn_f2 .OR. ien_f .NE. ien_f2 .OR. jsn_f .NE. jsn_f2 .OR. jen_f .NE. jen_f2 .OR. & + isn_c .NE. isn_c2 .OR. ien_c .NE. ien_c2 .OR. jsn_c .NE. jsn_c2 .OR. jen_c .NE. jen_c2 ) then + call mpp_error(FATAL, "test_mpp_domains: north buffer index mismatch for coarse to fine scalar") + endif + endif + + if(is_coarse_pe) then + call mpp_get_compute_domain(domain, isc_coarse, iec_coarse, jsc_coarse, jec_coarse) + call mpp_get_data_domain(domain, isd_coarse, ied_coarse, jsd_coarse, jed_coarse) + allocate(x(isd_coarse:ied_coarse, jsd_coarse:jed_coarse, nz)) + x = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, jec_coarse + do i = isc_coarse, iec_coarse + x(i,j,k) = tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + else + allocate(x(isd_fine:ied_fine, jsd_fine:jed_fine, nz)) + x = 0 + do k = 1, nz + do j = jsc_fine, jec_fine + do i = isc_fine, iec_fine + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + enddo + enddo + enddo + endif + + if( iew_c .GE. isw_c .AND. jew_c .GE. jsw_c ) then + allocate(wbuffer(isw_c:iew_c, jsw_c:jew_c,nz)) + allocate(wbuffer2(isw_c:iew_c, jsw_c:jew_c,nz)) + else + allocate(wbuffer(1,1,1)) + allocate(wbuffer2(1,1,1)) + endif + + if( iee_c .GE. ise_c .AND. jee_c .GE. jse_c ) then + allocate(ebuffer(ise_c:iee_c, jse_c:jee_c,nz)) + allocate(ebuffer2(ise_c:iee_c, jse_c:jee_c,nz)) + else + allocate(ebuffer(1,1,1)) + allocate(ebuffer2(1,1,1)) + endif + + if( ies_c .GE. iss_c .AND. jes_c .GE. jss_c ) then + allocate(sbuffer(iss_c:ies_c, jss_c:jes_c,nz)) + allocate(sbuffer2(iss_c:ies_c, jss_c:jes_c,nz)) + else + allocate(sbuffer(1,1,1)) + allocate(sbuffer2(1,1,1)) + endif + + if( ien_c .GE. isn_c .AND. jen_c .GE. 
jsn_c ) then + allocate(nbuffer(isn_c:ien_c, jsn_c:jen_c,nz)) + allocate(nbuffer2(isn_c:ien_c, jsn_c:jen_c,nz)) + else + allocate(nbuffer(1,1,1)) + allocate(nbuffer2(1,1,1)) + endif + ebuffer = 0; ebuffer2 = 0 + wbuffer = 0; wbuffer2 = 0 + sbuffer = 0; sbuffer2 = 0 + nbuffer = 0; nbuffer2 = 0 + + call mpp_update_nest_fine(x, nest_domain, wbuffer, sbuffer, ebuffer, nbuffer, nest_level=l) + + !--- compare with the assumed value. + if( is_fine_pe ) then + call mpp_set_current_pelist(my_pelist_fine) + if( iew_c .GE. isw_c .AND. jew_c .GE. jsw_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(wbuffer2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) + endif + call compare_checksums(wbuffer, wbuffer2, trim(type2)//' west buffer coarse to fine scalar') + + if( ies_c .GE. iss_c .AND. jes_c .GE. jss_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(sbuffer2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) + endif + call compare_checksums(sbuffer, sbuffer2, trim(type2)//' south buffer coarse to fine scalar') + + if( iee_c .GE. ise_c .AND. jee_c .GE. 
jse_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(ebuffer2, ise_c, iee_c, jse_c, jee_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) + endif + call compare_checksums(ebuffer, ebuffer2, trim(type2)//' east buffer coarse to fine scalar') + + if( ien_c .GE. isn_c .AND. jen_c .GE. jsn_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(nbuffer2, isn_c, ien_c, jsn_c, jen_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) + endif + call compare_checksums(nbuffer, nbuffer2, trim(type2)//' north buffer coarse to fine scalar') + endif + if(is_fine_pe) then + deallocate(wbuffer, ebuffer, sbuffer, nbuffer) + deallocate(wbuffer2, ebuffer2, sbuffer2, nbuffer2) + endif + deallocate(x) + + !--------------------------------------------------------------------------- + ! + ! coarse to fine BGRID scalar pair update + ! 
+ !--------------------------------------------------------------------------- + shift = 1 + !--- first check the index is correct or not + if(is_fine_pe) then + !--- The index from nest domain + call mpp_get_compute_domain(domain_fine, isc_fine, iec_fine, jsc_fine, jec_fine) + call mpp_get_data_domain(domain_fine, isd_fine, ied_fine, jsd_fine, jed_fine) + call mpp_get_C2F_index(nest_domain, isw_fx, iew_fx, jsw_fx, jew_fx, isw_cx, iew_cx, jsw_cx, jew_cx, WEST, l, position=CORNER) + call mpp_get_C2F_index(nest_domain, ise_fx, iee_fx, jse_fx, jee_fx, ise_cx, iee_cx, jse_cx, jee_cx, EAST, l, position=CORNER) + call mpp_get_C2F_index(nest_domain, iss_fx, ies_fx, jss_fx, jes_fx, iss_cx, ies_cx, jss_cx, jes_cx, SOUTH, l, position=CORNER) + call mpp_get_C2F_index(nest_domain, isn_fx, ien_fx, jsn_fx, jen_fx, isn_cx, ien_cx, jsn_cx, jen_cx, NORTH, l, position=CORNER) + call mpp_get_C2F_index(nest_domain, isw_fy, iew_fy, jsw_fy, jew_fy, isw_cy, iew_cy, jsw_cy, jew_cy, WEST, l, position=CORNER) + call mpp_get_C2F_index(nest_domain, ise_fy, iee_fy, jse_fy, jee_fy, ise_cy, iee_cy, jse_cy, jee_cy, EAST, l, position=CORNER) + call mpp_get_C2F_index(nest_domain, iss_fy, ies_fy, jss_fy, jes_fy, iss_cy, ies_cy, jss_cy, jes_cy, SOUTH, l, position=CORNER) + call mpp_get_C2F_index(nest_domain, isn_fy, ien_fy, jsn_fy, jen_fy, isn_cy, ien_cy, jsn_cy, jen_cy, NORTH, l, position=CORNER) + + !-- The assumed index + isw_fx2 = 0; iew_fx2 = -1; jsw_fx2 = 0; jew_fx2 = -1 + isw_cx2 = 0; iew_cx2 = -1; jsw_cx2 = 0; jew_cx2 = -1 + ise_fx2 = 0; iee_fx2 = -1; jse_fx2 = 0; jee_fx2 = -1 + ise_cx2 = 0; iee_cx2 = -1; jse_cx2 = 0; jee_cx2 = -1 + iss_fx2 = 0; ies_fx2 = -1; jss_fx2 = 0; jes_fx2 = -1 + iss_cx2 = 0; ies_cx2 = -1; jss_cx2 = 0; jes_cx2 = -1 + isn_fx2 = 0; ien_fx2 = -1; jsn_fx2 = 0; jen_fx2 = -1 + isn_cx2 = 0; ien_cx2 = -1; jsn_cx2 = 0; jen_cx2 = -1 + isw_fy2 = 0; iew_fy2 = -1; jsw_fy2 = 0; jew_fy2 = -1 + isw_cy2 = 0; iew_cy2 = -1; jsw_cy2 = 0; jew_cy2 = -1 + ise_fy2 = 0; iee_fy2 = -1; jse_fy2 = 
0; jee_fy2 = -1 + ise_cy2 = 0; iee_cy2 = -1; jse_cy2 = 0; jee_cy2 = -1 + iss_fy2 = 0; ies_fy2 = -1; jss_fy2 = 0; jes_fy2 = -1 + iss_cy2 = 0; ies_cy2 = -1; jss_cy2 = 0; jes_cy2 = -1 + isn_fy2 = 0; ien_fy2 = -1; jsn_fy2 = 0; jen_fy2 = -1 + isn_cy2 = 0; ien_cy2 = -1; jsn_cy2 = 0; jen_cy2 = -1 + + !--- west + if( isc_fine == 1 ) then + isw_fx2 = isd_fine + iew_fx2 = isc_fine - 1 + jsw_fx2 = jsd_fine + jew_fx2 = jed_fine + shift + isw_cx2 = istart_coarse(my_fine_id)-whalo + iew_cx2 = istart_coarse(my_fine_id) + jsw_cx2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jew_cx2 = jstart_coarse(my_fine_id) + (jec_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + shift + isw_fy2 = isd_fine + iew_fy2 = isc_fine - 1 + jsw_fy2 = jsd_fine + jew_fy2 = jed_fine + shift + isw_cy2 = istart_coarse(my_fine_id)-whalo + iew_cy2 = istart_coarse(my_fine_id) + jsw_cy2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jew_cy2 = jstart_coarse(my_fine_id) + (jec_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + shift + endif + !--- east + if( iec_fine == nx_fine ) then + ise_fx2 = iec_fine+1+shift + iee_fx2 = ied_fine + shift + jse_fx2 = jsd_fine + jee_fx2 = jed_fine + shift + ise_cx2 = iend_coarse(my_fine_id)+shift + iee_cx2 = iend_coarse(my_fine_id)+ehalo+shift + jse_cx2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jee_cx2 = jstart_coarse(my_fine_id) + (jec_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + shift + ise_fy2 = iec_fine+1 + shift + iee_fy2 = ied_fine + shift + jse_fy2 = jsd_fine + jee_fy2 = jed_fine + shift + ise_cy2 = iend_coarse(my_fine_id) + shift + iee_cy2 = iend_coarse(my_fine_id)+ehalo + shift + jse_cy2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jee_cy2 = jstart_coarse(my_fine_id) + (jec_fine - 
jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + shift + endif + !--- south + if( jsc_fine == 1 ) then + iss_fx2 = isd_fine + ies_fx2 = ied_fine + shift + jss_fx2 = jsd_fine + jes_fx2 = jsc_fine - 1 + iss_cx2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ies_cx2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + shift + jss_cx2 = jstart_coarse(my_fine_id)-shalo + jes_cx2 = jstart_coarse(my_fine_id) + iss_fy2 = isd_fine + ies_fy2 = ied_fine + shift + jss_fy2 = jsd_fine + jes_fy2 = jsc_fine - 1 + iss_cy2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ies_cy2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + shift + jss_cy2 = jstart_coarse(my_fine_id)-shalo + jes_cy2 = jstart_coarse(my_fine_id) + endif + !--- north + if( jec_fine == ny_fine ) then + isn_fx2 = isd_fine + ien_fx2 = ied_fine + shift + jsn_fx2 = jec_fine+1 + shift + jen_fx2 = jed_fine + shift + isn_cx2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ien_cx2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + shift + jsn_cx2 = jend_coarse(my_fine_id) + shift + jen_cx2 = jend_coarse(my_fine_id)+nhalo + shift + isn_fy2 = isd_fine + ien_fy2 = ied_fine + shift + jsn_fy2 = jec_fine+1 + shift + jen_fy2 = jed_fine + shift + isn_cy2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ien_cy2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + shift + jsn_cy2 = jend_coarse(my_fine_id) + shift + jen_cy2 = jend_coarse(my_fine_id)+nhalo + shift + endif + + if( isw_fx .NE. isw_fx2 .OR. iew_fx .NE. iew_fx2 .OR. jsw_fx .NE. jsw_fx2 .OR. jew_fx .NE. jew_fx2 .OR. & + isw_cx .NE. isw_cx2 .OR. iew_cx .NE. iew_cx2 .OR. jsw_cx .NE. 
jsw_cx2 .OR. jew_cx .NE. jew_cx2 ) then + call mpp_error(FATAL, "test_mpp_domains: west buffer index mismatch for coarse to fine BGRID X") + endif + if( ise_fx .NE. ise_fx2 .OR. iee_fx .NE. iee_fx2 .OR. jse_fx .NE. jse_fx2 .OR. jee_fx .NE. jee_fx2 .OR. & + ise_cx .NE. ise_cx2 .OR. iee_cx .NE. iee_cx2 .OR. jse_cx .NE. jse_cx2 .OR. jee_cx .NE. jee_cx2 ) then + call mpp_error(FATAL, "test_mpp_domains: east buffer index mismatch for coarse to fine BGRID X") + endif + if( iss_fx .NE. iss_fx2 .OR. ies_fx .NE. ies_fx2 .OR. jss_fx .NE. jss_fx2 .OR. jes_fx .NE. jes_fx2 .OR. & + iss_cx .NE. iss_cx2 .OR. ies_cx .NE. ies_cx2 .OR. jss_cx .NE. jss_cx2 .OR. jes_cx .NE. jes_cx2 ) then + call mpp_error(FATAL, "test_mpp_domains: south buffer index mismatch for coarse to fine BGRID X") + endif + if( isn_fx .NE. isn_fx2 .OR. ien_fx .NE. ien_fx2 .OR. jsn_fx .NE. jsn_fx2 .OR. jen_fx .NE. jen_fx2 .OR. & + isn_cx .NE. isn_cx2 .OR. ien_cx .NE. ien_cx2 .OR. jsn_cx .NE. jsn_cx2 .OR. jen_cx .NE. jen_cx2 ) then + call mpp_error(FATAL, "test_mpp_domains: north buffer index mismatch for coarse to fine BGRID X") + endif + + if( isw_fy .NE. isw_fy2 .OR. iew_fy .NE. iew_fy2 .OR. jsw_fy .NE. jsw_fy2 .OR. jew_fy .NE. jew_fy2 .OR. & + isw_cy .NE. isw_cy2 .OR. iew_cy .NE. iew_cy2 .OR. jsw_cy .NE. jsw_cy2 .OR. jew_cy .NE. jew_cy2 ) then + call mpp_error(FATAL, "test_mpp_domains: west buffer index mismatch for coarse to fine BGRID Y") + endif + if( ise_fy .NE. ise_fy2 .OR. iee_fy .NE. iee_fy2 .OR. jse_fy .NE. jse_fy2 .OR. jee_fy .NE. jee_fy2 .OR. & + ise_cy .NE. ise_cy2 .OR. iee_cy .NE. iee_cy2 .OR. jse_cy .NE. jse_cy2 .OR. jee_cy .NE. jee_cy2 ) then + call mpp_error(FATAL, "test_mpp_domains: east buffer index mismatch for coarse to fine BGRID Y") + endif + if( iss_fy .NE. iss_fy2 .OR. ies_fy .NE. ies_fy2 .OR. jss_fy .NE. jss_fy2 .OR. jes_fy .NE. jes_fy2 .OR. & + iss_cy .NE. iss_cy2 .OR. ies_cy .NE. ies_cy2 .OR. jss_cy .NE. jss_cy2 .OR. jes_cy .NE. 
jes_cy2 ) then + call mpp_error(FATAL, "test_mpp_domains: south buffer index mismatch for coarse to fine BGRID Y") + endif + if( isn_fy .NE. isn_fy2 .OR. ien_fy .NE. ien_fy2 .OR. jsn_fy .NE. jsn_fy2 .OR. jen_fy .NE. jen_fy2 .OR. & + isn_cy .NE. isn_cy2 .OR. ien_cy .NE. ien_cy2 .OR. jsn_cy .NE. jsn_cy2 .OR. jen_cy .NE. jen_cy2 ) then + call mpp_error(FATAL, "test_mpp_domains: north buffer index mismatch for coarse to fine BGRID Y") + endif + endif + + if(is_coarse_pe) then + call mpp_get_compute_domain(domain_coarse, isc_coarse, iec_coarse, jsc_coarse, jec_coarse) + call mpp_get_data_domain(domain_coarse, isd_coarse, ied_coarse, jsd_coarse, jed_coarse) + allocate(x(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse+shift, nz)) + allocate(y(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse+shift, nz)) + x = 0 + y = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, jec_coarse+shift + do i = isc_coarse, iec_coarse+shift + x(i,j,k) = 1e3 + tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_coarse, jec_coarse+shift + do i = isc_coarse, iec_coarse+shift + y(i,j,k) = 2e3 + tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + else + allocate(x(isd_fine:ied_fine+shift, jsd_fine:jed_fine+shift, nz)) + allocate(y(isd_fine:ied_fine+shift, jsd_fine:jed_fine+shift, nz)) + x = 0 + y = 0 + do k = 1, nz + do j = jsc_fine, jec_fine+shift + do i = isc_fine, iec_fine+shift + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 1e-3 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_fine, jec_fine+shift + do i = isc_fine, iec_fine+shift + y(i,j,k) = i*1.e+6 + j*1.e+3 + k + 2e-3 + enddo + enddo + enddo + endif + + if(is_fine_pe) then + if( iew_cx .GE. isw_cx .AND. jew_cx .GE. 
jsw_cx ) then + allocate(wbufferx(isw_cx:iew_cx, jsw_cx:jew_cx,nz)) + allocate(wbuffery(isw_cy:iew_cy, jsw_cy:jew_cy,nz)) + allocate(wbufferx2(isw_cx:iew_cx, jsw_cx:jew_cx,nz)) + allocate(wbuffery2(isw_cy:iew_cy, jsw_cy:jew_cy,nz)) + else + allocate(wbufferx(1,1,1)) + allocate(wbuffery(1,1,1)) + allocate(wbufferx2(1,1,1)) + allocate(wbuffery2(1,1,1)) + endif + if( iee_cx .GE. ise_cx .AND. jee_cx .GE. jse_cx ) then + allocate(ebufferx(ise_cx:iee_cx, jse_cx:jee_cx,nz)) + allocate(ebuffery(ise_cy:iee_cy, jse_cy:jee_cy,nz)) + allocate(ebufferx2(ise_cx:iee_cx, jse_cx:jee_cx,nz)) + allocate(ebuffery2(ise_cy:iee_cy, jse_cy:jee_cy,nz)) + else + allocate(ebufferx(1,1,1)) + allocate(ebuffery(1,1,1)) + allocate(ebufferx2(1,1,1)) + allocate(ebuffery2(1,1,1)) + endif + if( ies_cx .GE. iss_cx .AND. jes_cx .GE. jss_cx ) then + allocate(sbufferx(iss_cx:ies_cx, jss_cx:jes_cx,nz)) + allocate(sbuffery(iss_cy:ies_cy, jss_cy:jes_cy,nz)) + allocate(sbufferx2(iss_cx:ies_cx, jss_cx:jes_cx,nz)) + allocate(sbuffery2(iss_cy:ies_cy, jss_cy:jes_cy,nz)) + else + allocate(sbufferx(1,1,1)) + allocate(sbuffery(1,1,1)) + allocate(sbufferx2(1,1,1)) + allocate(sbuffery2(1,1,1)) + endif + if( ien_cx .GE. isn_cx .AND. jen_cx .GE. 
jsn_cx ) then + allocate(nbufferx(isn_cx:ien_cx, jsn_cx:jen_cx,nz)) + allocate(nbuffery(isn_cy:ien_cy, jsn_cy:jen_cy,nz)) + allocate(nbufferx2(isn_cx:ien_cx, jsn_cx:jen_cx,nz)) + allocate(nbuffery2(isn_cy:ien_cy, jsn_cy:jen_cy,nz)) + else + allocate(nbufferx(1,1,1)) + allocate(nbuffery(1,1,1)) + allocate(nbufferx2(1,1,1)) + allocate(nbuffery2(1,1,1)) + endif + wbufferx = 0; wbufferx2 = 0 + wbuffery = 0; wbuffery2 = 0 + sbufferx = 0; sbufferx2 = 0 + sbuffery = 0; sbuffery2 = 0 + ebufferx = 0; ebufferx2 = 0 + ebuffery = 0; ebuffery2 = 0 + nbufferx = 0; nbufferx2 = 0 + nbuffery = 0; nbuffery2 = 0 + endif + call mpp_update_nest_fine(x, y, nest_domain, wbufferx, wbuffery, sbufferx, sbuffery, ebufferx, ebuffery, & + nbufferx, nbuffery, nest_level=l, gridtype=BGRID_NE, flags=SCALAR_PAIR) + + !--- compare with the assumed value. + if( is_fine_pe ) then + call mpp_set_current_pelist(my_pelist_fine) + if( iew_c .GE. isw_c .AND. jew_c .GE. jsw_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(wbufferx2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) + call fill_nest_data(wbuffery2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) + endif + if( ies_c .GE. iss_c .AND. jes_c .GE. 
jss_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(sbufferx2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) + call fill_nest_data(sbuffery2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) + endif + if( iee_c .GE. ise_c .AND. jee_c .GE. jse_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(ebufferx2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) + call fill_nest_data(ebuffery2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) + endif + if( ien_c .GE. isn_c .AND. jen_c .GE. 
jsn_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(nbufferx2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 1e3, 2e3, 1, 1, nx, ny) + call fill_nest_data(nbuffery2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, 1, 1, nx, ny) + endif + + call compare_checksums(wbufferx, wbufferx2, trim(type2)//' west buffer coarse to fine BGRID scalar pair X') + call compare_checksums(wbuffery, wbuffery2, trim(type2)//' west buffer coarse to fine BGRID scalar pair Y') + call compare_checksums(sbufferx, sbufferx2, trim(type2)//' south buffer coarse to fine BGRID scalar pair X') + call compare_checksums(sbuffery, sbuffery2, trim(type2)//' south buffer coarse to fine BGRID scalar pair Y') + call compare_checksums(ebufferx, ebufferx2, trim(type2)//' east buffer coarse to fine BGRID scalar pair X') + call compare_checksums(ebuffery, ebuffery2, trim(type2)//' east buffer coarse to fine BGRID scalar pair Y') + call compare_checksums(nbufferx, nbufferx2, trim(type2)//' north buffer coarse to fine BGRID scalar pair X') + call compare_checksums(nbuffery, nbuffery2, trim(type2)//' north buffer coarse to fine BGRID scalar pair Y') + endif + if(allocated(x)) deallocate(x) + if(allocated(y)) deallocate(y) + if(is_fine_pe) then + deallocate(wbufferx, ebufferx, sbufferx, nbufferx) + deallocate(wbufferx2, ebufferx2, sbufferx2, nbufferx2) + deallocate(wbuffery, ebuffery, sbuffery, nbuffery) + deallocate(wbuffery2, ebuffery2, sbuffery2, nbuffery2) + endif + + !--------------------------------------------------------------------------- + ! + ! 
Coarse to Fine scalar field, position = CORNER + ! + !--------------------------------------------------------------------------- + + if(is_coarse_pe) then + call mpp_get_compute_domain(domain_coarse, isc_coarse, iec_coarse, jsc_coarse, jec_coarse) + call mpp_get_data_domain(domain_coarse, isd_coarse, ied_coarse, jsd_coarse, jed_coarse) + allocate(x(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse+shift, nz)) + x = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, jec_coarse+shift + do i = isc_coarse, iec_coarse+shift + x(i,j,k) = tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + else + allocate(x(isd_fine:ied_fine+shift, jsd_fine:jed_fine+shift, nz)) + x = 0 + do k = 1, nz + do j = jsc_fine, jec_fine+shift + do i = isc_fine, iec_fine+shift + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + enddo + enddo + enddo + endif + + if(is_fine_pe) then + if( iew_c .GE. isw_c .AND. jew_c .GE. jsw_c ) then + allocate(wbuffer(isw_cx:iew_cx, jsw_cx:jew_cx,nz)) + allocate(wbuffer2(isw_cx:iew_cx, jsw_cx:jew_cx,nz)) + else + allocate(wbuffer(1,1,1)) + allocate(wbuffer2(1,1,1)) + endif + wbuffer = 0; wbuffer2 = 0 + + if( iee_c .GE. ise_c .AND. jee_c .GE. jse_c ) then + allocate(ebuffer(ise_cx:iee_cx, jse_cx:jee_cx,nz)) + allocate(ebuffer2(ise_cx:iee_cx, jse_cx:jee_cx,nz)) + else + allocate(ebuffer(1,1,1)) + allocate(ebuffer2(1,1,1)) + endif + ebuffer = 0; ebuffer2 = 0 + + if( ies_c .GE. iss_c .AND. jes_c .GE. jss_c ) then + allocate(sbuffer(iss_cx:ies_cx, jss_cx:jes_cx,nz)) + allocate(sbuffer2(iss_cx:ies_cx, jss_cx:jes_cx,nz)) + else + allocate(sbuffer(1,1,1)) + allocate(sbuffer2(1,1,1)) + endif + sbuffer = 0; sbuffer2 = 0 + + if( ien_c .GE. isn_c .AND. jen_c .GE. 
jsn_c ) then + allocate(nbuffer(isn_cx:ien_cx, jsn_cx:jen_cx,nz)) + allocate(nbuffer2(isn_cx:ien_cx, jsn_cx:jen_cx,nz)) + else + allocate(nbuffer(1,1,1)) + allocate(nbuffer2(1,1,1)) + endif + nbuffer = 0; nbuffer2 = 0 + + endif + + call mpp_update_nest_fine(x, nest_domain, wbuffer, sbuffer, ebuffer, nbuffer, nest_level=l, position=CORNER) + + !--- compare with the assumed value. + if( is_fine_pe ) then + call mpp_set_current_pelist(my_pelist_fine) + if( iew_c .GE. isw_c .AND. jew_c .GE. jsw_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(wbuffer2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) + endif + call compare_checksums(wbuffer, wbuffer2, trim(type2)//' west buffer coarse to fine scalar CORNER') + + if( ies_c .GE. iss_c .AND. jes_c .GE. jss_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(sbuffer2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) + endif + call compare_checksums(sbuffer, sbuffer2, trim(type2)//' south buffer coarse to fine scalar CORNER') + + if( iee_c .GE. ise_c .AND. jee_c .GE. 
jse_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(ebuffer2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) + endif + call compare_checksums(ebuffer, ebuffer2, trim(type2)//' east buffer coarse to fine scalar CORNER') + + if( ien_c .GE. isn_c .AND. jen_c .GE. jsn_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(nbuffer2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 0.0, 0.0, 1, 1, nx, ny) + endif + call compare_checksums(nbuffer, nbuffer2, trim(type2)//' north buffer coarse to fine scalar CORNER') + + endif + if(is_fine_pe) then + deallocate(wbuffer, ebuffer, sbuffer, nbuffer) + deallocate(wbuffer2, ebuffer2, sbuffer2, nbuffer2) + endif + deallocate(x) + + + !--------------------------------------------------------------------------- + ! + ! coarse to fine CGRID scalar pair update + ! 
+ !--------------------------------------------------------------------------- + shift = 1 + !--- first check the index is correct or not + if(is_fine_pe) then + !--- The index from nest domain + call mpp_get_compute_domain(domain_fine, isc_fine, iec_fine, jsc_fine, jec_fine) + call mpp_get_data_domain(domain_fine, isd_fine, ied_fine, jsd_fine, jed_fine) + call mpp_get_C2F_index(nest_domain, isw_fx, iew_fx, jsw_fx, jew_fx, isw_cx, iew_cx, jsw_cx, jew_cx, WEST, l, position=EAST) + call mpp_get_C2F_index(nest_domain, ise_fx, iee_fx, jse_fx, jee_fx, ise_cx, iee_cx, jse_cx, jee_cx, EAST, l, position=EAST) + call mpp_get_C2F_index(nest_domain, iss_fx, ies_fx, jss_fx, jes_fx, iss_cx, ies_cx, jss_cx, jes_cx, SOUTH, l, position=EAST) + call mpp_get_C2F_index(nest_domain, isn_fx, ien_fx, jsn_fx, jen_fx, isn_cx, ien_cx, jsn_cx, jen_cx, NORTH, l, position=EAST) + call mpp_get_C2F_index(nest_domain, isw_fy, iew_fy, jsw_fy, jew_fy, isw_cy, iew_cy, jsw_cy, jew_cy, WEST, l, position=NORTH) + call mpp_get_C2F_index(nest_domain, ise_fy, iee_fy, jse_fy, jee_fy, ise_cy, iee_cy, jse_cy, jee_cy, EAST, l, position=NORTH) + call mpp_get_C2F_index(nest_domain, iss_fy, ies_fy, jss_fy, jes_fy, iss_cy, ies_cy, jss_cy, jes_cy, SOUTH, l, position=NORTH) + call mpp_get_C2F_index(nest_domain, isn_fy, ien_fy, jsn_fy, jen_fy, isn_cy, ien_cy, jsn_cy, jen_cy, NORTH, l, position=NORTH) + + !-- The assumed index + isw_fx2 = 0; iew_fx2 = -1; jsw_fx2 = 0; jew_fx2 = -1 + isw_cx2 = 0; iew_cx2 = -1; jsw_cx2 = 0; jew_cx2 = -1 + ise_fx2 = 0; iee_fx2 = -1; jse_fx2 = 0; jee_fx2 = -1 + ise_cx2 = 0; iee_cx2 = -1; jse_cx2 = 0; jee_cx2 = -1 + iss_fx2 = 0; ies_fx2 = -1; jss_fx2 = 0; jes_fx2 = -1 + iss_cx2 = 0; ies_cx2 = -1; jss_cx2 = 0; jes_cx2 = -1 + isn_fx2 = 0; ien_fx2 = -1; jsn_fx2 = 0; jen_fx2 = -1 + isn_cx2 = 0; ien_cx2 = -1; jsn_cx2 = 0; jen_cx2 = -1 + isw_fy2 = 0; iew_fy2 = -1; jsw_fy2 = 0; jew_fy2 = -1 + isw_cy2 = 0; iew_cy2 = -1; jsw_cy2 = 0; jew_cy2 = -1 + ise_fy2 = 0; iee_fy2 = -1; jse_fy2 = 0; jee_fy2 
= -1 + ise_cy2 = 0; iee_cy2 = -1; jse_cy2 = 0; jee_cy2 = -1 + iss_fy2 = 0; ies_fy2 = -1; jss_fy2 = 0; jes_fy2 = -1 + iss_cy2 = 0; ies_cy2 = -1; jss_cy2 = 0; jes_cy2 = -1 + isn_fy2 = 0; ien_fy2 = -1; jsn_fy2 = 0; jen_fy2 = -1 + isn_cy2 = 0; ien_cy2 = -1; jsn_cy2 = 0; jen_cy2 = -1 + + !--- west + if( isc_fine == 1 ) then + isw_fx2 = isd_fine + iew_fx2 = isc_fine - 1 + jsw_fx2 = jsd_fine + jew_fx2 = jed_fine + isw_cx2 = istart_coarse(my_fine_id)-whalo + iew_cx2 = istart_coarse(my_fine_id) + jsw_cx2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jew_cx2 = jstart_coarse(my_fine_id) + (jec_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + isw_fy2 = isd_fine + iew_fy2 = isc_fine - 1 + jsw_fy2 = jsd_fine + jew_fy2 = jed_fine + shift + isw_cy2 = istart_coarse(my_fine_id)-whalo + iew_cy2 = istart_coarse(my_fine_id) + jsw_cy2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jew_cy2 = jstart_coarse(my_fine_id) + (jec_fine + shift - jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + endif + !--- east + if( iec_fine == nx_fine ) then + ise_fx2 = iec_fine+1+shift + iee_fx2 = ied_fine + shift + jse_fx2 = jsd_fine + jee_fx2 = jed_fine + ise_cx2 = iend_coarse(my_fine_id)+shift + iee_cx2 = iend_coarse(my_fine_id)+ehalo+shift + jse_cx2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jee_cx2 = jstart_coarse(my_fine_id) + (jec_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + ise_fy2 = iec_fine+1 + iee_fy2 = ied_fine + jse_fy2 = jsd_fine + jee_fy2 = jed_fine + shift + ise_cy2 = iend_coarse(my_fine_id) + iee_cy2 = iend_coarse(my_fine_id)+ehalo + jse_cy2 = jstart_coarse(my_fine_id) + (jsc_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) - shalo + jee_cy2 = jstart_coarse(my_fine_id) + (jec_fine - jstart_fine(my_fine_id))/y_refine(my_fine_id) + nhalo + shift + endif + !--- south + if( jsc_fine == 1 ) then + 
iss_fx2 = isd_fine + ies_fx2 = ied_fine + shift + jss_fx2 = jsd_fine + jes_fx2 = jsc_fine - 1 + iss_cx2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ies_cx2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + shift + jss_cx2 = jstart_coarse(my_fine_id)-shalo + jes_cx2 = jstart_coarse(my_fine_id) + iss_fy2 = isd_fine + ies_fy2 = ied_fine + jss_fy2 = jsd_fine + jes_fy2 = jsc_fine - 1 + iss_cy2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ies_cy2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + jss_cy2 = jstart_coarse(my_fine_id)-shalo + jes_cy2 = jstart_coarse(my_fine_id) + endif + !--- north + if( jec_fine == ny_fine ) then + isn_fx2 = isd_fine + ien_fx2 = ied_fine + shift + jsn_fx2 = jec_fine+1 + jen_fx2 = jed_fine + isn_cx2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ien_cx2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + shift + jsn_cx2 = jend_coarse(my_fine_id) + jen_cx2 = jend_coarse(my_fine_id)+nhalo + isn_fy2 = isd_fine + ien_fy2 = ied_fine + jsn_fy2 = jec_fine+1 + shift + jen_fy2 = jed_fine + shift + isn_cy2 = istart_coarse(my_fine_id) + (isc_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) - whalo + ien_cy2 = istart_coarse(my_fine_id) + (iec_fine - istart_fine(my_fine_id))/x_refine(my_fine_id) + ehalo + jsn_cy2 = jend_coarse(my_fine_id) + shift + jen_cy2 = jend_coarse(my_fine_id)+nhalo + shift + endif + + if( isw_fx .NE. isw_fx2 .OR. iew_fx .NE. iew_fx2 .OR. jsw_fx .NE. jsw_fx2 .OR. jew_fx .NE. jew_fx2 .OR. & + isw_cx .NE. isw_cx2 .OR. iew_cx .NE. iew_cx2 .OR. jsw_cx .NE. jsw_cx2 .OR. jew_cx .NE. jew_cx2 ) then + call mpp_error(FATAL, "test_mpp_domains: west buffer index mismatch for coarse to fine CGRID X") + endif + if( ise_fx .NE. ise_fx2 .OR. 
iee_fx .NE. iee_fx2 .OR. jse_fx .NE. jse_fx2 .OR. jee_fx .NE. jee_fx2 .OR. & + ise_cx .NE. ise_cx2 .OR. iee_cx .NE. iee_cx2 .OR. jse_cx .NE. jse_cx2 .OR. jee_cx .NE. jee_cx2 ) then + call mpp_error(FATAL, "test_mpp_domains: east buffer index mismatch for coarse to fine CGRID X") + endif + if( iss_fx .NE. iss_fx2 .OR. ies_fx .NE. ies_fx2 .OR. jss_fx .NE. jss_fx2 .OR. jes_fx .NE. jes_fx2 .OR. & + iss_cx .NE. iss_cx2 .OR. ies_cx .NE. ies_cx2 .OR. jss_cx .NE. jss_cx2 .OR. jes_cx .NE. jes_cx2 ) then + call mpp_error(FATAL, "test_mpp_domains: south buffer index mismatch for coarse to fine CGRID X") + endif + if( isn_fx .NE. isn_fx2 .OR. ien_fx .NE. ien_fx2 .OR. jsn_fx .NE. jsn_fx2 .OR. jen_fx .NE. jen_fx2 .OR. & + isn_cx .NE. isn_cx2 .OR. ien_cx .NE. ien_cx2 .OR. jsn_cx .NE. jsn_cx2 .OR. jen_cx .NE. jen_cx2 ) then + call mpp_error(FATAL, "test_mpp_domains: north buffer index mismatch for coarse to fine CGRID X") + endif + + if( isw_fy .NE. isw_fy2 .OR. iew_fy .NE. iew_fy2 .OR. jsw_fy .NE. jsw_fy2 .OR. jew_fy .NE. jew_fy2 .OR. & + isw_cy .NE. isw_cy2 .OR. iew_cy .NE. iew_cy2 .OR. jsw_cy .NE. jsw_cy2 .OR. jew_cy .NE. jew_cy2 ) then + call mpp_error(FATAL, "test_mpp_domains: west buffer index mismatch for coarse to fine CGRID Y") + endif + if( ise_fy .NE. ise_fy2 .OR. iee_fy .NE. iee_fy2 .OR. jse_fy .NE. jse_fy2 .OR. jee_fy .NE. jee_fy2 .OR. & + ise_cy .NE. ise_cy2 .OR. iee_cy .NE. iee_cy2 .OR. jse_cy .NE. jse_cy2 .OR. jee_cy .NE. jee_cy2 ) then + call mpp_error(FATAL, "test_mpp_domains: east buffer index mismatch for coarse to fine CGRID Y") + endif + if( iss_fy .NE. iss_fy2 .OR. ies_fy .NE. ies_fy2 .OR. jss_fy .NE. jss_fy2 .OR. jes_fy .NE. jes_fy2 .OR. & + iss_cy .NE. iss_cy2 .OR. ies_cy .NE. ies_cy2 .OR. jss_cy .NE. jss_cy2 .OR. jes_cy .NE. jes_cy2 ) then + call mpp_error(FATAL, "test_mpp_domains: south buffer index mismatch for coarse to fine CGRID Y") + endif + if( isn_fy .NE. isn_fy2 .OR. ien_fy .NE. ien_fy2 .OR. jsn_fy .NE. jsn_fy2 .OR. jen_fy .NE. jen_fy2 .OR. 
& + isn_cy .NE. isn_cy2 .OR. ien_cy .NE. ien_cy2 .OR. jsn_cy .NE. jsn_cy2 .OR. jen_cy .NE. jen_cy2 ) then + call mpp_error(FATAL, "test_mpp_domains: north buffer index mismatch for coarse to fine CGRID Y") + endif + endif + + if(is_coarse_pe) then + call mpp_get_compute_domain(domain_coarse, isc_coarse, iec_coarse, jsc_coarse, jec_coarse) + call mpp_get_data_domain(domain_coarse, isd_coarse, ied_coarse, jsd_coarse, jed_coarse) + allocate(x(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse, nz)) + allocate(y(isd_coarse:ied_coarse, jsd_coarse:jed_coarse+shift, nz)) + x = 0 + y = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, jec_coarse + do i = isc_coarse, iec_coarse+shift + x(i,j,k) = 1e3 + tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_coarse, jec_coarse+shift + do i = isc_coarse, iec_coarse + y(i,j,k) = 2e3 + tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + else + allocate(x(isd_fine:ied_fine+shift, jsd_fine:jed_fine, nz)) + allocate(y(isd_fine:ied_fine, jsd_fine:jed_fine+shift, nz)) + x = 0 + y = 0 + do k = 1, nz + do j = jsc_fine, jec_fine + do i = isc_fine, iec_fine+shift + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 1e-3 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_fine, jec_fine+shift + do i = isc_fine, iec_fine + y(i,j,k) = i*1.e+6 + j*1.e+3 + k + 2e-3 + enddo + enddo + enddo + endif + + if(is_fine_pe) then + if( iew_cx .GE. isw_cx .AND. jew_cx .GE. jsw_cx ) then + allocate(wbufferx(isw_cx:iew_cx, jsw_cx:jew_cx,nz)) + allocate(wbuffery(isw_cy:iew_cy, jsw_cy:jew_cy,nz)) + allocate(wbufferx2(isw_cx:iew_cx, jsw_cx:jew_cx,nz)) + allocate(wbuffery2(isw_cy:iew_cy, jsw_cy:jew_cy,nz)) + else + allocate(wbufferx(1,1,1)) + allocate(wbuffery(1,1,1)) + allocate(wbufferx2(1,1,1)) + allocate(wbuffery2(1,1,1)) + endif + if( iee_cx .GE. ise_cx .AND. jee_cx .GE. 
jse_cx ) then + allocate(ebufferx(ise_cx:iee_cx, jse_cx:jee_cx,nz)) + allocate(ebuffery(ise_cy:iee_cy, jse_cy:jee_cy,nz)) + allocate(ebufferx2(ise_cx:iee_cx, jse_cx:jee_cx,nz)) + allocate(ebuffery2(ise_cy:iee_cy, jse_cy:jee_cy,nz)) + else + allocate(ebufferx(1,1,1)) + allocate(ebuffery(1,1,1)) + allocate(ebufferx2(1,1,1)) + allocate(ebuffery2(1,1,1)) + endif + if( ies_cx .GE. iss_cx .AND. jes_cx .GE. jss_cx ) then + allocate(sbufferx(iss_cx:ies_cx, jss_cx:jes_cx,nz)) + allocate(sbuffery(iss_cy:ies_cy, jss_cy:jes_cy,nz)) + allocate(sbufferx2(iss_cx:ies_cx, jss_cx:jes_cx,nz)) + allocate(sbuffery2(iss_cy:ies_cy, jss_cy:jes_cy,nz)) + else + allocate(sbufferx(1,1,1)) + allocate(sbuffery(1,1,1)) + allocate(sbufferx2(1,1,1)) + allocate(sbuffery2(1,1,1)) + endif + if( ien_cx .GE. isn_cx .AND. jen_cx .GE. jsn_cx ) then + allocate(nbufferx(isn_cx:ien_cx, jsn_cx:jen_cx,nz)) + allocate(nbuffery(isn_cy:ien_cy, jsn_cy:jen_cy,nz)) + allocate(nbufferx2(isn_cx:ien_cx, jsn_cx:jen_cx,nz)) + allocate(nbuffery2(isn_cy:ien_cy, jsn_cy:jen_cy,nz)) + else + allocate(nbufferx(1,1,1)) + allocate(nbuffery(1,1,1)) + allocate(nbufferx2(1,1,1)) + allocate(nbuffery2(1,1,1)) + endif + wbufferx = 0; wbufferx2 = 0 + wbuffery = 0; wbuffery2 = 0 + sbufferx = 0; sbufferx2 = 0 + sbuffery = 0; sbuffery2 = 0 + ebufferx = 0; ebufferx2 = 0 + ebuffery = 0; ebuffery2 = 0 + nbufferx = 0; nbufferx2 = 0 + nbuffery = 0; nbuffery2 = 0 + endif + call mpp_update_nest_fine(x, y, nest_domain, wbufferx, wbuffery, sbufferx, sbuffery, ebufferx, ebuffery, & + nbufferx, nbuffery, nest_level=l, gridtype=CGRID_NE, flags=SCALAR_PAIR) + + !--- compare with the assumed value. + if( is_fine_pe ) then + call mpp_set_current_pelist(my_pelist_fine) + if( iew_c .GE. isw_c .AND. jew_c .GE. 
jsw_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(wbufferx2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) + call fill_nest_data(wbuffery2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) + endif + if( ies_c .GE. iss_c .AND. jes_c .GE. jss_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(sbufferx2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) + call fill_nest_data(sbuffery2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) + endif + if( iee_c .GE. ise_c .AND. jee_c .GE. 
jse_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(ebufferx2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) + call fill_nest_data(ebuffery2, ise_c, iee_c, jse_c, jee_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) + endif + if( ien_c .GE. isn_c .AND. jen_c .GE. jsn_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(nbufferx2, isn_c, ien_c, jsn_c, jen_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) + call fill_nest_data(nbuffery2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, 1, 1, nx, ny) + endif + + call compare_checksums(wbufferx, wbufferx2, trim(type2)//' west buffer coarse to fine CGRID scalar pair X') + call compare_checksums(wbuffery, wbuffery2, trim(type2)//' west buffer coarse to fine CGRID scalar pair Y') + call compare_checksums(sbufferx, sbufferx2, trim(type2)//' south buffer coarse to fine CGRID scalar pair X') + call compare_checksums(sbuffery, sbuffery2, trim(type2)//' south buffer coarse to fine CGRID scalar pair Y') + call compare_checksums(ebufferx, ebufferx2, trim(type2)//' east buffer coarse to fine CGRID scalar pair X') + call compare_checksums(ebuffery, ebuffery2, trim(type2)//' east buffer coarse to fine 
CGRID scalar pair Y') + call compare_checksums(nbufferx, nbufferx2, trim(type2)//' north buffer coarse to fine CGRID scalar pair X') + call compare_checksums(nbuffery, nbuffery2, trim(type2)//' north buffer coarse to fine CGRID scalar pair Y') + endif + + !--------------------------------------------------------------------------- + ! + ! coarse to fine CGRID vector update + ! + !--------------------------------------------------------------------------- + + if(is_coarse_pe) then + call mpp_get_compute_domain(domain_coarse, isc_coarse, iec_coarse, jsc_coarse, jec_coarse) + call mpp_get_data_domain(domain_coarse, isd_coarse, ied_coarse, jsd_coarse, jed_coarse) + x = 0 + y = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, jec_coarse + do i = isc_coarse, iec_coarse+shift + x(i,j,k) = 1e3 + tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_coarse, jec_coarse+shift + do i = isc_coarse, iec_coarse + y(i,j,k) = 2e3 + tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + else + x = 0 + y = 0 + do k = 1, nz + do j = jsc_fine, jec_fine + do i = isc_fine, iec_fine+shift + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 1e-3 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_fine, jec_fine+shift + do i = isc_fine, iec_fine + y(i,j,k) = i*1.e+6 + j*1.e+3 + k + 2e-3 + enddo + enddo + enddo + endif + + if(is_fine_pe) then + wbufferx = 0; wbufferx2 = 0 + wbuffery = 0; wbuffery2 = 0 + sbufferx = 0; sbufferx2 = 0 + sbuffery = 0; sbuffery2 = 0 + ebufferx = 0; ebufferx2 = 0 + ebuffery = 0; ebuffery2 = 0 + nbufferx = 0; nbufferx2 = 0 + nbuffery = 0; nbuffery2 = 0 + endif + call mpp_update_nest_fine(x, y, nest_domain, wbufferx, wbuffery, sbufferx, sbuffery, ebufferx, ebuffery, & + nbufferx, nbuffery, nest_level=l, gridtype=CGRID_NE) + + !--- compare with the assumed value. + if( is_fine_pe ) then + call mpp_set_current_pelist(my_pelist_fine) + if( iew_c .GE. isw_c .AND. jew_c .GE. 
jsw_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(wbufferx2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) + call fill_nest_data(wbuffery2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) + endif + if( ies_c .GE. iss_c .AND. jes_c .GE. jss_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(sbufferx2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) + call fill_nest_data(sbuffery2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) + endif + if( iee_c .GE. ise_c .AND. jee_c .GE. 
jse_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(ebufferx2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) + call fill_nest_data(ebuffery2, ise_c, iee_c, jse_c, jee_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) + endif + if( ien_c .GE. isn_c .AND. jen_c .GE. jsn_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(nbufferx2, isn_c, ien_c, jsn_c, jen_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) + call fill_nest_data(nbuffery2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, -1, 1, nx, ny) + endif + + call compare_checksums(wbufferx, wbufferx2, trim(type2)//' west buffer coarse to fine CGRID vector X') + call compare_checksums(wbuffery, wbuffery2, trim(type2)//' west buffer coarse to fine CGRID vector Y') + call compare_checksums(sbufferx, sbufferx2, trim(type2)//' south buffer coarse to fine CGRID vector X') + call compare_checksums(sbuffery, sbuffery2, trim(type2)//' south buffer coarse to fine CGRID vector Y') + call compare_checksums(ebufferx, ebufferx2, trim(type2)//' east buffer coarse to fine CGRID vector X') + call compare_checksums(ebuffery, ebuffery2, trim(type2)//' east buffer coarse to fine CGRID vector Y') + call 
compare_checksums(nbufferx, nbufferx2, trim(type2)//' north buffer coarse to fine CGRID vector X') + call compare_checksums(nbuffery, nbuffery2, trim(type2)//' north buffer coarse to fine CGRID vector Y') + endif + + if(allocated(x)) deallocate(x) + if(allocated(y)) deallocate(y) + if(is_fine_pe) then + deallocate(wbufferx, ebufferx, sbufferx, nbufferx) + deallocate(wbufferx2, ebufferx2, sbufferx2, nbufferx2) + deallocate(wbuffery, ebuffery, sbuffery, nbuffery) + deallocate(wbuffery2, ebuffery2, sbuffery2, nbuffery2) + endif + + + !--------------------------------------------------------------------------- + ! + ! coarse to fine DGRID vector update + ! + !--------------------------------------------------------------------------- + shift = 1 + + if(is_coarse_pe) then + call mpp_get_compute_domain(domain_coarse, isc_coarse, iec_coarse, jsc_coarse, jec_coarse) + call mpp_get_data_domain(domain_coarse, isd_coarse, ied_coarse, jsd_coarse, jed_coarse) + allocate(y(isd_coarse:ied_coarse+shift, jsd_coarse:jed_coarse, nz)) + allocate(x(isd_coarse:ied_coarse, jsd_coarse:jed_coarse+shift, nz)) + x = 0 + y = 0 + tile = my_tile_id(1) + do k = 1, nz + do j = jsc_coarse, jec_coarse+shift + do i = isc_coarse, iec_coarse + x(i,j,k) = 1e3 + tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_coarse, jec_coarse + do i = isc_coarse, iec_coarse+shift + y(i,j,k) = 2e3 + tile + i*1.e-3 + j*1.e-6 + k*1.e-9 + enddo + enddo + enddo + else + allocate(y(isd_fine:ied_fine+shift, jsd_fine:jed_fine, nz)) + allocate(x(isd_fine:ied_fine, jsd_fine:jed_fine+shift, nz)) + x = 0 + y = 0 + do k = 1, nz + do j = jsc_fine, jec_fine+shift + do i = isc_fine, iec_fine + x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 1e-3 + enddo + enddo + enddo + do k = 1, nz + do j = jsc_fine, jec_fine + do i = isc_fine, iec_fine+shift + y(i,j,k) = i*1.e+6 + j*1.e+3 + k + 2e-3 + enddo + enddo + enddo + endif + + if(is_fine_pe) then + call mpp_get_C2F_index(nest_domain, isw_fx, iew_fx, jsw_fx, 
jew_fx, isw_cx, iew_cx, jsw_cx, jew_cx, WEST, l, position=NORTH) + call mpp_get_C2F_index(nest_domain, ise_fx, iee_fx, jse_fx, jee_fx, ise_cx, iee_cx, jse_cx, jee_cx, EAST, l, position=NORTH) + call mpp_get_C2F_index(nest_domain, iss_fx, ies_fx, jss_fx, jes_fx, iss_cx, ies_cx, jss_cx, jes_cx, SOUTH, l, position=NORTH) + call mpp_get_C2F_index(nest_domain, isn_fx, ien_fx, jsn_fx, jen_fx, isn_cx, ien_cx, jsn_cx, jen_cx, NORTH, l, position=NORTH) + call mpp_get_C2F_index(nest_domain, isw_fy, iew_fy, jsw_fy, jew_fy, isw_cy, iew_cy, jsw_cy, jew_cy, WEST, l, position=EAST) + call mpp_get_C2F_index(nest_domain, ise_fy, iee_fy, jse_fy, jee_fy, ise_cy, iee_cy, jse_cy, jee_cy, EAST, l, position=EAST) + call mpp_get_C2F_index(nest_domain, iss_fy, ies_fy, jss_fy, jes_fy, iss_cy, ies_cy, jss_cy, jes_cy, SOUTH, l, position=EAST) + call mpp_get_C2F_index(nest_domain, isn_fy, ien_fy, jsn_fy, jen_fy, isn_cy, ien_cy, jsn_cy, jen_cy, NORTH, l, position=EAST) + + if( iew_cx .GE. isw_cx .AND. jew_cx .GE. jsw_cx ) then + allocate(wbufferx(isw_cx:iew_cx, jsw_cx:jew_cx,nz)) + allocate(wbuffery(isw_cy:iew_cy, jsw_cy:jew_cy,nz)) + allocate(wbufferx2(isw_cx:iew_cx, jsw_cx:jew_cx,nz)) + allocate(wbuffery2(isw_cy:iew_cy, jsw_cy:jew_cy,nz)) + else + allocate(wbufferx(1,1,1)) + allocate(wbuffery(1,1,1)) + allocate(wbufferx2(1,1,1)) + allocate(wbuffery2(1,1,1)) + endif + if( iee_cx .GE. ise_cx .AND. jee_cx .GE. jse_cx ) then + allocate(ebufferx(ise_cx:iee_cx, jse_cx:jee_cx,nz)) + allocate(ebuffery(ise_cy:iee_cy, jse_cy:jee_cy,nz)) + allocate(ebufferx2(ise_cx:iee_cx, jse_cx:jee_cx,nz)) + allocate(ebuffery2(ise_cy:iee_cy, jse_cy:jee_cy,nz)) + else + allocate(ebufferx(1,1,1)) + allocate(ebuffery(1,1,1)) + allocate(ebufferx2(1,1,1)) + allocate(ebuffery2(1,1,1)) + endif + if( ies_cx .GE. iss_cx .AND. jes_cx .GE. 
jss_cx ) then + allocate(sbufferx(iss_cx:ies_cx, jss_cx:jes_cx,nz)) + allocate(sbuffery(iss_cy:ies_cy, jss_cy:jes_cy,nz)) + allocate(sbufferx2(iss_cx:ies_cx, jss_cx:jes_cx,nz)) + allocate(sbuffery2(iss_cy:ies_cy, jss_cy:jes_cy,nz)) + else + allocate(sbufferx(1,1,1)) + allocate(sbuffery(1,1,1)) + allocate(sbufferx2(1,1,1)) + allocate(sbuffery2(1,1,1)) + endif + if( ien_cx .GE. isn_cx .AND. jen_cx .GE. jsn_cx ) then + allocate(nbufferx(isn_cx:ien_cx, jsn_cx:jen_cx,nz)) + allocate(nbuffery(isn_cy:ien_cy, jsn_cy:jen_cy,nz)) + allocate(nbufferx2(isn_cx:ien_cx, jsn_cx:jen_cx,nz)) + allocate(nbuffery2(isn_cy:ien_cy, jsn_cy:jen_cy,nz)) + else + allocate(nbufferx(1,1,1)) + allocate(nbuffery(1,1,1)) + allocate(nbufferx2(1,1,1)) + allocate(nbuffery2(1,1,1)) + endif + + wbufferx = 0; wbufferx2 = 0 + wbuffery = 0; wbuffery2 = 0 + sbufferx = 0; sbufferx2 = 0 + sbuffery = 0; sbuffery2 = 0 + ebufferx = 0; ebufferx2 = 0 + ebuffery = 0; ebuffery2 = 0 + nbufferx = 0; nbufferx2 = 0 + nbuffery = 0; nbuffery2 = 0 + endif + call mpp_update_nest_fine(x, y, nest_domain, wbufferx, wbuffery, sbufferx, sbuffery, ebufferx, ebuffery, & + nbufferx, nbuffery, nest_level=l, gridtype=DGRID_NE) + + !--- compare with the assumed value. + if( is_fine_pe ) then + call mpp_set_current_pelist(my_pelist_fine) + if( iew_c .GE. isw_c .AND. jew_c .GE. jsw_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(wbufferx2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) + call fill_nest_data(wbuffery2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) + endif + if( ies_c .GE. 
iss_c .AND. jes_c .GE. jss_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(sbufferx2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) + call fill_nest_data(sbuffery2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) + endif + if( iee_c .GE. ise_c .AND. jee_c .GE. jse_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(ebufferx2, ise_c, iee_c, jse_c, jee_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) + call fill_nest_data(ebuffery2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) + endif + if( ien_c .GE. isn_c .AND. jen_c .GE. 
jsn_c ) then + call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & + nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) + call fill_nest_data(nbufferx2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 1e3, 2e3, 1, -1, nx, ny) + call fill_nest_data(nbuffery2, isn_c, ien_c, jsn_c, jen_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) + endif - end subroutine fill_nest_data + call compare_checksums(wbufferx, wbufferx2, trim(type2)//' west buffer coarse to fine DGRID vector X') + call compare_checksums(wbuffery, wbuffery2, trim(type2)//' west buffer coarse to fine DGRID vector Y') + call compare_checksums(sbufferx, sbufferx2, trim(type2)//' south buffer coarse to fine DGRID vector X') + call compare_checksums(sbuffery, sbuffery2, trim(type2)//' south buffer coarse to fine DGRID vector Y') + call compare_checksums(ebufferx, ebufferx2, trim(type2)//' east buffer coarse to fine DGRID vector X') + call compare_checksums(ebuffery, ebuffery2, trim(type2)//' east buffer coarse to fine DGRID vector Y') + call compare_checksums(nbufferx, nbufferx2, trim(type2)//' north buffer coarse to fine DGRID vector X') + call compare_checksums(nbuffery, nbuffery2, trim(type2)//' north buffer coarse to fine DGRID vector Y') + endif -!############################################################################### + if(allocated(x)) deallocate(x) + if(allocated(y)) deallocate(y) + if(is_fine_pe) then + deallocate(wbufferx, ebufferx, sbufferx, nbufferx) + deallocate(wbufferx2, ebufferx2, sbufferx2, nbufferx2) + deallocate(wbuffery, ebuffery, sbuffery, nbuffery) + deallocate(wbuffery2, ebuffery2, sbuffery2, nbuffery2) + endif + endif + deallocate(my_pelist, my_pelist_fine) + call 
mpp_set_current_pelist() + enddo + call mpp_set_current_pelist() + deallocate(pelist) - subroutine test_update_nest_domain( type ) + end subroutine test_update_nest_domain_r8 + !########################################################################### + !# MZ + subroutine test_update_nest_domain_r4( type ) character(len=*), intent(in) :: type logical :: cubic_grid logical :: is_fine_pe, is_coarse_pe @@ -7734,20 +7849,20 @@ subroutine test_update_nest_domain( type ) integer, allocatable :: my_pelist_fine(:) integer, allocatable :: pe_start(:), pe_end(:) integer, allocatable :: layout2D(:,:), global_indices(:,:) - real, allocatable :: x(:,:,:), x1(:,:,:), x2(:,:,:) - real, allocatable :: y(:,:,:), y1(:,:,:), y2(:,:,:) - real, allocatable :: wbuffer(:,:,:), wbuffer2(:,:,:) - real, allocatable :: ebuffer(:,:,:), ebuffer2(:,:,:) - real, allocatable :: sbuffer(:,:,:), sbuffer2(:,:,:) - real, allocatable :: nbuffer(:,:,:), nbuffer2(:,:,:) - real, allocatable :: wbufferx(:,:,:), wbufferx2(:,:,:) - real, allocatable :: ebufferx(:,:,:), ebufferx2(:,:,:) - real, allocatable :: sbufferx(:,:,:), sbufferx2(:,:,:) - real, allocatable :: nbufferx(:,:,:), nbufferx2(:,:,:) - real, allocatable :: wbuffery(:,:,:), wbuffery2(:,:,:) - real, allocatable :: ebuffery(:,:,:), ebuffery2(:,:,:) - real, allocatable :: sbuffery(:,:,:), sbuffery2(:,:,:) - real, allocatable :: nbuffery(:,:,:), nbuffery2(:,:,:) + real(kind=r4_kind), allocatable :: x(:,:,:), x1(:,:,:), x2(:,:,:) + real(kind=r4_kind), allocatable :: y(:,:,:), y1(:,:,:), y2(:,:,:) + real(kind=r4_kind), allocatable :: wbuffer(:,:,:), wbuffer2(:,:,:) + real(kind=r4_kind), allocatable :: ebuffer(:,:,:), ebuffer2(:,:,:) + real(kind=r4_kind), allocatable :: sbuffer(:,:,:), sbuffer2(:,:,:) + real(kind=r4_kind), allocatable :: nbuffer(:,:,:), nbuffer2(:,:,:) + real(kind=r4_kind), allocatable :: wbufferx(:,:,:), wbufferx2(:,:,:) + real(kind=r4_kind), allocatable :: ebufferx(:,:,:), ebufferx2(:,:,:) + real(kind=r4_kind), allocatable :: 
sbufferx(:,:,:), sbufferx2(:,:,:) + real(kind=r4_kind), allocatable :: nbufferx(:,:,:), nbufferx2(:,:,:) + real(kind=r4_kind), allocatable :: wbuffery(:,:,:), wbuffery2(:,:,:) + real(kind=r4_kind), allocatable :: ebuffery(:,:,:), ebuffery2(:,:,:) + real(kind=r4_kind), allocatable :: sbuffery(:,:,:), sbuffery2(:,:,:) + real(kind=r4_kind), allocatable :: nbuffery(:,:,:), nbuffery2(:,:,:) integer :: x_refine(num_nest), y_refine(num_nest) integer :: istart_fine(num_nest), iend_fine(num_nest) integer :: jstart_fine(num_nest), jend_fine(num_nest) @@ -8018,7 +8133,7 @@ subroutine test_update_nest_domain( type ) je_c = min(je_coarse(n), jec_coarse) if( tile == t_coarse(n) .AND. ie_c .GE. is_c .AND. je_c .GE. js_c ) then call fill_coarse_data(x2, rotate_coarse(n), iadd_coarse(n), jadd_coarse(n), & - is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, 0, 0, 0.001, 0.001, 1, 1, & + is_c, ie_c, js_c, je_c, nz, isd_coarse, jsd_coarse, nx, ny, 0, 0, 0.001_8, 0.001_8, 1, 1, & .false., .false., iend_coarse(1), jend_coarse(1) ) endif enddo @@ -8156,7 +8271,7 @@ subroutine test_update_nest_domain( type ) y = 0 do k = 1, nz do j = js_cx, je_cx - do i = is_cx, ie_cx + Do i = is_cx, ie_cx x(i,j,k) = i*1.e+6 + j*1.e+3 + k + 1.0E-6 enddo enddo @@ -8471,7 +8586,7 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(wbuffer2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) endif call compare_checksums(wbuffer, wbuffer2, trim(type2)//' west buffer coarse to fine scalar') @@ -8479,7 +8594,7 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 
1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(sbuffer2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) endif call compare_checksums(sbuffer, sbuffer2, trim(type2)//' south buffer coarse to fine scalar') @@ -8487,7 +8602,7 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(ebuffer2, ise_c, iee_c, jse_c, jee_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) endif call compare_checksums(ebuffer, ebuffer2, trim(type2)//' east buffer coarse to fine scalar') @@ -8495,7 +8610,7 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(nbuffer2, isn_c, ien_c, jsn_c, jen_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) endif call compare_checksums(nbuffer, nbuffer2, trim(type2)//' north buffer coarse to fine scalar') endif @@ -8762,33 +8877,33 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, 
tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(wbufferx2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) call fill_nest_data(wbuffery2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) endif if( ies_c .GE. iss_c .AND. jes_c .GE. jss_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(sbufferx2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) call fill_nest_data(sbuffery2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) endif if( iee_c .GE. ise_c .AND. jee_c .GE. 
jse_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(ebufferx2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1) + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) call fill_nest_data(ebuffery2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1) + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) endif if( ien_c .GE. isn_c .AND. jen_c .GE. jsn_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(nbufferx2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 1e3, 2e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 1e3, 2e3, 1, 1, nx, ny) call fill_nest_data(nbuffery2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, 1, 1, nx, ny) endif call compare_checksums(wbufferx, wbufferx2, trim(type2)//' west buffer coarse to fine BGRID scalar pair X') @@ -8888,7 +9003,7 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & nnest, 
t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(wbuffer2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) endif call compare_checksums(wbuffer, wbuffer2, trim(type2)//' west buffer coarse to fine scalar CORNER') @@ -8896,7 +9011,7 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(sbuffer2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) endif call compare_checksums(sbuffer, sbuffer2, trim(type2)//' south buffer coarse to fine scalar CORNER') @@ -8904,7 +9019,7 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(ebuffer2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1) + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 0.0, 0.0, 1, 1, nx, ny) endif call compare_checksums(ebuffer, ebuffer2, trim(type2)//' east buffer coarse to fine scalar CORNER') @@ -8912,7 +9027,7 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), 
(/jen_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(nbuffer2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, shift, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 0.0, 0.0, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 0.0, 0.0, 1, 1, nx, ny) endif call compare_checksums(nbuffer, nbuffer2, trim(type2)//' north buffer coarse to fine scalar CORNER') @@ -9181,33 +9296,33 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(wbufferx2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) call fill_nest_data(wbuffery2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) endif if( ies_c .GE. iss_c .AND. jes_c .GE. 
jss_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(sbufferx2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) call fill_nest_data(sbuffery2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) endif if( iee_c .GE. ise_c .AND. jee_c .GE. jse_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(ebufferx2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1) + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) call fill_nest_data(ebuffery2, ise_c, iee_c, jse_c, jee_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, 1, 1, nx, ny) endif if( ien_c .GE. isn_c .AND. jen_c .GE. 
jsn_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(nbufferx2, isn_c, ien_c, jsn_c, jen_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, 1, nx, ny) call fill_nest_data(nbuffery2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, 1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, 1, 1, nx, ny) endif call compare_checksums(wbufferx, wbufferx2, trim(type2)//' west buffer coarse to fine CGRID scalar pair X') @@ -9285,33 +9400,33 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(wbufferx2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) call fill_nest_data(wbuffery2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) endif if( ies_c .GE. iss_c .AND. jes_c .GE. 
jss_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(sbufferx2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) call fill_nest_data(sbuffery2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) endif if( iee_c .GE. ise_c .AND. jee_c .GE. jse_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(ebufferx2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1) + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) call fill_nest_data(ebuffery2, ise_c, iee_c, jse_c, jee_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) endif if( ien_c .GE. isn_c .AND. jen_c .GE. 
jsn_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(nbufferx2, isn_c, ien_c, jsn_c, jen_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) call fill_nest_data(nbuffery2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, -1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 2e3, 1e3, -1, 1, nx, ny) endif call compare_checksums(wbufferx, wbufferx2, trim(type2)//' west buffer coarse to fine CGRID vector X') @@ -9458,33 +9573,33 @@ subroutine test_update_nest_domain( type ) call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isw_c/), (/iew_c/), (/jsw_c/), (/jew_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(wbufferx2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) call fill_nest_data(wbuffery2, isw_c, iew_c, jsw_c, jew_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) endif if( ies_c .GE. iss_c .AND. jes_c .GE. 
jss_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/iss_c/), (/ies_c/), (/jss_c/), (/jes_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(sbufferx2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, 0, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) call fill_nest_data(sbuffery2, iss_c, ies_c, jss_c, jes_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) endif if( iee_c .GE. ise_c .AND. jee_c .GE. jse_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/ise_c/), (/iee_c/), (/jse_c/), (/jee_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(ebufferx2, ise_c, iee_c, jse_c, jee_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 1e3, 2e3, 1, -1, nx, ny) call fill_nest_data(ebuffery2, ise_c+shift, iee_c, jse_c, jee_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1) + rotate_coarse, is_coarse+shift, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) endif if( ien_c .GE. isn_c .AND. jen_c .GE. 
jsn_c ) then call get_nnest2(domain_coarse, 1, tile_coarse(my_fine_id:my_fine_id), (/isn_c/), (/ien_c/), (/jsn_c/), (/jen_c/), & nnest, t_coarse, iadd_coarse, jadd_coarse, rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse) call fill_nest_data(nbufferx2, isn_c, ien_c, jsn_c+shift, jen_c, nnest, t_coarse, 0, shift, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 1e3, 2e3, 1, -1) + rotate_coarse, is_coarse, ie_coarse, js_coarse+shift, je_coarse, 1e3, 2e3, 1, -1, nx, ny) call fill_nest_data(nbuffery2, isn_c, ien_c, jsn_c, jen_c, nnest, t_coarse, shift, 0, iadd_coarse, jadd_coarse, & - rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1) + rotate_coarse, is_coarse, ie_coarse, js_coarse, je_coarse, 2e3, 1e3, -1, 1, nx, ny) endif call compare_checksums(wbufferx, wbufferx2, trim(type2)//' west buffer coarse to fine DGRID vector X') @@ -9513,7 +9628,8 @@ subroutine test_update_nest_domain( type ) call mpp_set_current_pelist() deallocate(pelist) - end subroutine test_update_nest_domain + end subroutine test_update_nest_domain_r4 + !############################################################################ !--- this routine will get number of nest. @@ -9569,73 +9685,6 @@ subroutine convert_index_up(domain, rotate, ncross, is_coarse, ie_coarse, js_coa end subroutine convert_index_up !############################################################################ - subroutine fill_coarse_data(data, rotate, iadd, jadd, is_c, ie_c, js_c, je_c, nz, isd, jsd, nx, ny, & - ishift, jshift, x_add, y_add, sign1, sign2, x_cyclic, y_cyclic, ieg, jeg) - integer, intent(in) :: rotate, is_c, ie_c, js_c, je_c, nz, isd, jsd, iadd, jadd, nx, ny, ishift, jshift - integer, intent(in) :: sign1, sign2 - real, intent(inout) :: data(isd:, jsd:, :) - real, intent(in) :: x_add, y_add - logical, intent(in) :: x_cyclic, y_cyclic - integer, intent(in) :: ieg, jeg - integer :: i, j, k - - select case (rotate) - case (ZERO) - ! 
convert the index to be consistent with the fine grid. - do k = 1, nz - do j = js_c, je_c+jshift - do i = is_c, ie_c+ishift - data(i,j,k) = (i+iadd)*1.e+6 + (j+jadd)*1.e+3 + k + x_add - enddo - enddo - enddo - case (NINETY) - ! convert the index to be consistent with the fine grid. - do k = 1, nz - do j = js_c, je_c+jshift - do i = is_c, ie_c+ishift - data(i,j,k) = sign1*((nx-j+1+iadd+jshift)*1.e+6 + (i+jadd)*1.e+3 + k + y_add) - enddo - enddo - enddo - case (MINUS_NINETY) - ! convert the index to be consistent with the fine grid. - do k = 1, nz - do j = js_c, je_c+jshift - do i = is_c, ie_c+ishift - data(i,j,k) = sign2*((j+iadd)*1.e+6 + (ny-i+1+jadd+ishift)*1.e+3 + k + y_add) - enddo - enddo - enddo - case default - call mpp_error(FATAL,"fill_coarse_data: rotate_coarse must be ZERO, NINETY, MINUS_NINETY") - end select - - !---handle cyclic condition - if(x_cyclic) then - if(ie_c+ishift+iadd == ieg) then - i = ie_c+ishift - do k = 1, nz - do j = js_c, je_c+jshift - data(i,j,k) = i*1.e+6 + (j+jadd)*1.e+3 + k + x_add - enddo - enddo - endif - endif - - - if(y_cyclic) then - if(je_c+jshift+jadd == jeg) then - j = je_c+jshift - do k = 1, nz - do j = js_c, je_c+jshift - data(i,j,k) = (i+iadd)*1.e+6 + j*1.e+3 + k + x_add - enddo - enddo - endif - endif - - end subroutine fill_coarse_data subroutine test_get_boundary_ad(type) use mpp_mod, only : mpp_pe, mpp_npes, mpp_root_pe, mpp_sum @@ -9802,260 +9851,6 @@ subroutine test_get_boundary_ad(type) end subroutine test_get_boundary_ad - subroutine test_halo_update_ad( type ) - use mpp_mod, only : mpp_pe, mpp_npes, mpp_root_pe, mpp_sum - use mpp_domains_mod, only : CGRID_NE - use mpp_domains_mod, only : mpp_update_domains, mpp_update_domains_ad - - character(len=*), intent(in) :: type - type(domain2D) :: domain - - integer :: shift, i, j, k - logical :: is_symmetry - integer :: is, ie, js, je, isd, ied, jsd, jed, pe - - real*8, allocatable, dimension(:,:,:) :: x_ad, y_ad, x_fd, y_fd, x_save, y_save - real*8 :: ad_sum, fd_sum - 
- if(index(type, 'symmetry') == 0) then - is_symmetry = .false. - else - is_symmetry = .true. - end if - select case(type) - case( 'Simple', 'Simple symmetry' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, name=type, symmetry = is_symmetry ) - case( 'Cyclic', 'Cyclic symmetry' ) - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN, & - name=type, symmetry = is_symmetry ) - case default - call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type ) - end select - -!set up x array - call mpp_get_compute_domain( domain, is, ie, js, je ) - call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) - - shift=1 -!---test 3d single fields---------------------------------------------------------- - allocate( x_fd(isd:ied,jsd:jed,nz) ) - allocate( x_ad(isd:ied,jsd:jed,nz) ) - allocate( x_save(isd:ied,jsd:jed,nz) ) - x_fd = 0.; x_ad = 0.; x_save = 0. - - do k = 1,nz - do j = js,je - do i = is,ie - x_fd(i,j,k) = i*j - end do - end do - end do - x_save = x_fd - -!full update - call mpp_update_domains( x_fd, domain ) - - fd_sum = 0. - do k = 1,nz - do j = jsd,jed - do i = isd,ied - fd_sum = fd_sum + x_fd(i,j,k)*x_fd(i,j,k) - end do - end do - end do - call mpp_sum( fd_sum ) - - x_ad = x_fd - call mpp_update_domains_ad( x_ad, domain ) - - ad_sum = 0. 
- do k = 1,nz - do j = jsd,jed - do i = isd,ied - ad_sum = ad_sum + x_ad(i,j,k)*x_save(i,j,k) - end do - end do - end do - call mpp_sum( ad_sum ) - - pe = mpp_pe() - if( pe.EQ.mpp_root_pe() ) then - if (abs(ad_sum-fd_sum)/fd_sum.lt.1E-7) then - print*, "Passed Adjoint Dot Test: mpp_update_domains_ad(single 3D field)" - endif - endif - - deallocate (x_ad, x_fd, x_save) - -!---test 3d vector fields---------------------------------------------------------- - allocate( x_ad (isd:ied+shift,jsd:jed ,nz) ) - allocate( x_fd (isd:ied+shift,jsd:jed ,nz) ) - allocate( x_save(isd:ied+shift,jsd:jed ,nz) ) - allocate( y_ad (isd:ied ,jsd:jed+shift,nz) ) - allocate( y_fd (isd:ied ,jsd:jed+shift,nz) ) - allocate( y_save(isd:ied ,jsd:jed+shift,nz) ) - - x_fd=0; y_fd=0 - do k = 1,nz - do j = js,je - do i = is,ie - x_fd(i,j,k)=i*j - y_fd(i,j,k)=i*j - end do - end do - end do - - call mpp_update_domains( x_fd, y_fd, domain, gridtype=CGRID_NE) - x_save=x_fd - y_save=y_fd - - fd_sum = 0. - do k = 1,nz - do j = jsd,jed - do i = isd,ied+shift - fd_sum = fd_sum + x_fd(i,j,k)*x_fd(i,j,k) - end do - end do - end do - do k = 1,nz - do j = jsd,jed+shift - do i = isd,ied - fd_sum = fd_sum + y_fd(i,j,k)*y_fd(i,j,k) - end do - end do - end do - call mpp_sum( fd_sum ) - - x_ad = x_fd - y_ad = y_fd - call mpp_update_domains_ad( x_ad, y_ad, domain, gridtype=CGRID_NE) - - ad_sum = 0. 
- do k = 1,nz - do j = jsd,jed - do i = isd,ied+shift - ad_sum = ad_sum + x_ad(i,j,k)*x_save(i,j,k) - end do - end do - end do - do k = 1,nz - do j = jsd,jed+shift - do i = isd,ied - ad_sum = ad_sum + y_ad(i,j,k)*y_save(i,j,k) - end do - end do - end do - call mpp_sum( ad_sum ) - - if( pe.EQ.mpp_root_pe() ) then - if (abs(ad_sum-fd_sum)/fd_sum.lt.1E-7) then - print*, "Passed Adjoint Dot Test: mpp_update_domains_ad(vector 3D fields)" - endif - endif - deallocate (x_ad, y_ad, x_fd, y_fd, x_save, y_save) - - end subroutine test_halo_update_ad - - subroutine test_global_reduce_ad (type) - use mpp_mod, only : mpp_pe, mpp_npes, mpp_root_pe, mpp_sum - use mpp_domains_mod, only : mpp_global_sum_tl, mpp_global_sum_ad - character(len=*), intent(in) :: type - real :: gsum_tl, gsum_ad - real*8 :: gsum_tl_save, gsum_ad_save - real :: gsum_tl_bit, gsum_ad_bit - real*8 :: gsum_tl_save_bit, gsum_ad_save_bit - integer :: i,j,k, ishift, jshift, position - integer :: isd, ied, jsd, jed - - type(domain2D) :: domain - real, allocatable, dimension(:,:,:) :: x, x_ad, x_ad_bit - - !--- set up domain - call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) - select case(type) - case( 'Simple' ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, name=type ) - case( 'Simple symmetry center', 'Simple symmetry corner', 'Simple symmetry east', 'Simple symmetry north' ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & - shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) - case( 'Cyclic symmetry center', 'Cyclic symmetry corner', 'Cyclic symmetry east', 'Cyclic symmetry north' ) - call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & - name=type, symmetry = .true., xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN ) - case default - call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) - end select - - call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) - - !--- determine if an extra point is needed - ishift = 0; jshift = 0; position = CENTER - select case(type) - case ('Simple symmetry corner', 'Cyclic symmetry corner') - ishift = 1; jshift = 1; position = CORNER - case ('Simple symmetry east', 'Cyclic symmetry east' ) - ishift = 1; jshift = 0; position = EAST - case ('Simple symmetry north', 'Cyclic symmetry north') - ishift = 0; jshift = 1; position = NORTH - end select - - ied = ied+ishift; jed = jed+jshift - - allocate( x(isd:ied,jsd:jed,nz), x_ad(isd:ied,jsd:jed,nz), x_ad_bit(isd:ied,jsd:jed,nz) ) - - x=0. - do k = 1,nz - do j = jsd, jed - do i = isd, ied - x(i,j,k) = i+j+k - enddo - enddo - enddo - - gsum_tl = mpp_global_sum( domain, x, position = position ) - gsum_tl_bit = mpp_global_sum( domain, x, flags=BITWISE_EXACT_SUM ) - gsum_tl_save = gsum_tl*gsum_tl - gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit - - gsum_ad = gsum_tl - gsum_ad_bit = gsum_tl_bit - - x_ad = 0. - x_ad_bit = 0. - call mpp_global_sum_ad( domain, x_ad, gsum_ad, position = position ) - call mpp_global_sum_ad( domain, x_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) - - gsum_ad_save = 0. - gsum_ad_save_bit = 0. 
- - do k = 1,nz - do j = jsd, jed - do i = isd, ied - gsum_ad_save = gsum_ad_save + x_ad(i,j,k)*x(i,j,k) - gsum_ad_save_bit = gsum_ad_save_bit + x_ad_bit(i,j,k)*x(i,j,k) - enddo - enddo - enddo - - call mpp_sum( gsum_ad_save ) - call mpp_sum( gsum_ad_save_bit ) - - pe = mpp_pe() - if( pe.EQ.mpp_root_pe() ) then - if (abs(gsum_ad_save-gsum_tl_save)/gsum_tl_save.lt.1E-7) then - print*, "Passed Adjoint Dot Test: mpp_global_sum_ad" - endif - if (abs(gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit.lt.1E-7) then - print*, "Passed Adjoint Dot Test: mpp_global_sum_ad, flags=BITWISE_EXACT_SUM" - endif - endif - - deallocate(x, x_ad, x_ad_bit) - - end subroutine test_global_reduce_ad end program test_mpp_domains + diff --git a/test_fms/mpp/test_mpp_domains2.sh b/test_fms/mpp/test_mpp_domains2.sh index f6eee351f9..bdf9761cb7 100755 --- a/test_fms/mpp/test_mpp_domains2.sh +++ b/test_fms/mpp/test_mpp_domains2.sh @@ -39,6 +39,7 @@ touch input.nml # Input.nml is required to run the following tests echo "1: Test simple functionality" run_test test_domains_simple 4 +# Test update nest domains seen to work reliably with intel compiler 20 and 16 PEs. #echo "1: Test update nest domain" #sed "s/test_nest = .false./test_nest = .true./" $top_srcdir/test_fms/mpp/input_base.nml > input.nml run_test test_mpp_domains 2 skip diff --git a/test_fms/mpp/test_mpp_gatscat b/test_fms/mpp/test_mpp_gatscat new file mode 100755 index 0000000000..467e9c8501 --- /dev/null +++ b/test_fms/mpp/test_mpp_gatscat @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_gatscat - temporary wrapper script for .libs/test_mpp_gatscat +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_gatscat program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. 
It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_gatscat:test_mpp_gatscat:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_gatscat:test_mpp_gatscat:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_gatscat:test_mpp_gatscat:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. 
+ + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_gatscat' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 
1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_gatscat.F90 b/test_fms/mpp/test_mpp_gatscat.F90 new file mode 100644 index 0000000000..47ff6cf81c --- /dev/null +++ b/test_fms/mpp/test_mpp_gatscat.F90 @@ -0,0 +1,900 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +#ifdef SYSTEM_CLOCK +#undef SYSTEM_CLOCK +#endif + +!> @author Miguel Zuniga +!> @brief Test various mpp_gather and mpp_scatter routines. +!> @note Some of the tested mpp_gather routines are legacy routines originally in file test_mpp.F90. +!> @todo Routine test_gather_2DV is a legacy routine with legacy issues. See associated comments.
+program test_mpp_gatscat + +#ifdef sgi_mipspro + use shmem_interface +#endif + + use mpp_mod, only : mpp_init, mpp_exit, mpp_pe, mpp_npes, mpp_root_pe, stdout + use mpp_mod, only : mpp_sync + use mpp_mod, only : mpp_declare_pelist, mpp_set_current_pelist, mpp_set_stack_size + use mpp_mod, only : mpp_gather, mpp_scatter, mpp_error, FATAL + use mpp_io_mod, only: mpp_io_init, mpp_flush + use mpp_mod, only : mpp_init_test_requests_allocated + use platform_mod + +#ifdef use_MPI_GSM + use mpp_mod, only : mpp_gsm_free +#endif + + implicit none + + integer, parameter :: n=1048576 + real, allocatable, dimension(:) :: a, b, c +#ifdef use_MPI_GSM + real :: d(n) + pointer (locd, d) +#else + real, allocatable, dimension(:) :: d + integer(kind=i8_kind) :: locd +#endif + integer :: pe, npes, root, istat + integer :: out_unit + real :: dt + integer :: ierr + + call mpp_init(mpp_init_test_requests_allocated) + call mpp_io_init() + call mpp_set_stack_size(3145746) + pe = mpp_pe() + npes = mpp_npes() + root = mpp_root_pe() + out_unit = stdout() + + if( pe.EQ.root ) print *, '------------------> Calling test_scatter <------------------' + call test_scatter_2D(npes,pe,root,out_unit) + call test_scatter_3D(npes,pe,root,out_unit) + if( pe.EQ.root ) print *, '------------------> Finished test_scatter <------------------' + + if( pe.EQ.root ) print *, '------------------> Calling test_gather <------------------' + call test_gather(npes,pe,root,out_unit) + call test_gatherV(npes,pe,root,out_unit) + + !!test_gather_2DV does not always work and does not make sense. + !call test_gather2DV(npes,pe,root,out_unit) + + if( pe.EQ.root ) print *, '------------------> Finished test_gather <------------------' + + call MPI_finalize(ierr) + + +contains + +!> @brief Call some of the type specific (Float vs double) test_scatter_2D routines. 
+ subroutine test_scatter_2D(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + if(npes < 3)then + call mpp_error(FATAL, "Test_scatter_2D: minimum of 3 ranks required. Not testing gather; too few ranks.") + endif + write(out_unit,*) + + call test_scatter_2D_R4(npes, pe, root, out_unit) + + call test_scatter_2D_R8(npes, pe, root, out_unit) + + end subroutine test_scatter_2D + + +!> @brief Call some of the type specific (Float vs double) test_scatter_3D routines. + subroutine test_scatter_3D(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + if(npes < 3)then + call mpp_error(FATAL, "Test_scatter_3D: minimum of 3 ranks required. Not testing gather; too few ranks.") + endif + write(out_unit,*) + + call test_scatter_3D_R4(npes, pe, root, out_unit) + + call test_scatter_3D_R8(npes, pe, root, out_unit) + + end subroutine test_scatter_3D + + + !> @brief Test the mpp_scatter functions with FLOAT_KIND data arguments. + subroutine test_scatter_2D_R4(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i,j,k + real(kind=r4_kind), allocatable, dimension(:,:) :: data !!Data to be scattered + real(kind=r4_kind), allocatable, dimension(:,:) :: segment + integer :: DS, SS !!Source data size and segment size + integer :: iz, jz !!The zeroth element to be scattered is at pos data(is+iz, js+jz) + integer :: is, ie, js, je !!The amount of data to be scattered is (ie - is)*(je - js) + integer :: id, jd + + DS = 7 !! DS should be less than 10 for the tests below to make sense. + SS = 6 + allocate(data(DS, DS)) + allocate(segment(SS, SS)) + + !!The full PE list [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1 + segment = -2.0 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4) is 34.000, etc. 
+ if (pe == root) then + do i = 1,DS + do j = 1,DS + data(i,j) = i*10 + j + enddo + enddo + !! And re-initalize segment on the root pe. + do i = 1,SS + do j = 1,SS + segment(i,j) = i * 10 + j + enddo + enddo + endif + + !! Scatter from the source pe a subset of the data array. + !! The subset is to go into the segment array of the target pes. + !! The data to scatter is "moved" in a 1D array of size + !! S=(ie - is) * (je - js) and starts with the data at + !! position (iz,jz). Recall Fortran is column-major order. + iz = 2 + jz = 3 + is = 2 + ie = 3 + js = 2 + je = 3 + if(pe .eq. root) then + call mpp_scatter(is, ie, js, je, pelist(1:npes-1), segment, data, .true., iz, jz) + else + call mpp_scatter(is, ie, js, je, pelist(1:npes -1), segment, data, .false., iz, jz) + endif + + call mpp_sync() ! + + + !! Verify that the segment array has been updated on the target pes (i,e, those + !! in the pelist, which does not include pe numbered npes) + if(ANY(pe == pelist(1:npes-1))) then + i = 1 + j = 1 + !!Note below row (id index of "data() equivalent or formula") changing fastest. + do jd = js + jz, je + jz + do id = is + iz, ie + iz + if (segment(i,j) /= ( id * 10 + jd )) then + !!write(6,*) i, j, id, jd + call mpp_error(FATAL, "Test scatter 2D R4 failed in general scatter section.") + endif + !! Do to the next data element in segment + !! If just done the bottom element of a column: + if(i == SS) then + i = is + j = MOD(j + 1, SS) ! next column of segement() + else + i = i + 1 ! next row of segemnt() + endif + enddo + enddo + endif + + call mpp_sync() ! + write(out_unit,*) "Test test_scatter_2D_R4 successful at general scatter section." + + !!Verify that the last pe (numbered npes) did not get the segment array updated! + if(pe == pelist(npes)) then + do i = 1,SS + do j = 1,SS + if (segment(i,j) /= -2 ) then + call mpp_error(FATAL, "Test scatter 2D failed. pe=npes segment was changed") + endif + end do + end do + endif + + call mpp_sync() ! 
+ write(out_unit,*) "Test test_scatter_2D_R4 successful ." + +end subroutine test_scatter_2D_R4 + + !> @brief Test the mpp_scatter functions with DOUBLE_KIND data arguments. + subroutine test_scatter_2D_R8(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i,j,k + real(kind=r8_kind), allocatable, dimension(:,:) :: data !!Data to be scattered + real(kind=r8_kind), allocatable, dimension(:,:) :: segment + integer :: DS, SS !!Source data size and segment size + integer :: iz, jz !!The zeroth element to be scattered is at pos data(is+iz, js+jz) + integer :: is, ie, js, je !!The amount of data to be scattered is (ie - is)*(je - js) + integer :: id, jd + + + DS = 7 !! DS should be less than 10 for the tests below to make sense. + SS = 6 + allocate(data(DS, DS)) + allocate(segment(SS, SS)) + + !!The full PE list [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1 + segment = -2.0 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4) is 34.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + data(i,j) = i*10 + j + enddo + enddo + !! And re-initalize segment on the root pe. + do i = 1,SS + do j = 1,SS + segment(i,j) = i * 10 + j + enddo + enddo + endif + + !! Scatter from the source pe a subset of the data array. + !! The subset is to go into the segment array of the target pes. + !! The data to scatter is "moved" in a 1D array of size + !! S=(ie - is) * (je - js) and starts with the data at + !! position (iz,jz). Recall Fortran is column-major order. + iz = 2 + jz = 3 + is = 2 + ie = 3 + js = 2 + je = 3 + if(pe .eq. root) then + call mpp_scatter(is, ie, js, je, pelist(1:npes-1), segment, data, .true., iz, jz) + else + call mpp_scatter(is, ie, js, je, pelist(1:npes -1), segment, data, .false., iz, jz) + endif + + call mpp_sync() + + + !! 
Verify that the segment array has been updated on the target pes (i,e, those + !! in the pelist, which does not include pe numbered npes) + if(ANY(pe == pelist(1:npes-1))) then + i = 1 + j = 1 + !!Note below row (id index of "data() equivalent or formula") changing fastest. + do jd = js + jz, je + jz + do id = is + iz, ie + iz + if (segment(i,j) /= ( id * 10 + jd )) then + !!write(6,*) i, j, id, jd + call mpp_error(FATAL, "Test scatter 2D R8 failed in general scatter section.") + endif + !! Do to the next data element in segment + !! If just done the bottom element of a column: + if(i == SS) then + i = is + j = MOD(j + 1, SS) ! next column of segement() + else + i = i + 1 ! next row of segemnt() + endif + enddo + enddo + endif + + call mpp_sync() + write(out_unit,*) "Test test_scatter_2D_R8 successful at general scatter section." + + !!Verify that the last pe (numbered npes) did not get the segment array updated! + if(pe == pelist(npes)) then + do i = 1,SS + do j = 1,SS + if (segment(i,j) /= -2 ) then + call mpp_error(FATAL, "Test scatter 2D R8failed. pe=npes segment was changed") + endif + end do + end do + endif + + call mpp_sync() + write(out_unit,*) "Test test_scatter_2D_R8 successful ." + +end subroutine test_scatter_2D_R8 + +!> @brief Test the mpp_scatter 3D functions with FLOAT_KIND data arguments. + subroutine test_scatter_3D_R4(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i,j,k + real(kind=r4_kind), allocatable, dimension(:,:,:) :: data !!Data to be scattered + real(kind=r4_kind), allocatable, dimension(:,:,:) :: segment + integer :: DS, SS !!Source data size and segment size + integer :: iz, jz !!The zeroth element to be scattered is at pos data(is+iz, js+jz) + integer :: is, ie, js, je !!The amount of data to be scattered is (ie - is)*(je - js) + integer :: id, jd, kd + integer :: NZ + integer :: dAmount, dCount + + NZ = 11 !! Depth of the square tube to be scattered. + DS = 6 !! 
DS should be less than 10 for the tests below to make sense. + SS = 5 !! Can be different that DS, but see retrictions. + allocate(data(DS, DS, NZ)) + allocate(segment(SS, SS, NZ)) + + !!The full PE list is [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1 + segment = -2.0 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4,5) is 543.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + do k = 1,NZ + data(i,j, k) = k*100 + j*10 + i + enddo + enddo + enddo + !! And re-initalize segment on the root pe. + do i = 1,SS + do j = 1,SS + do k = 1,NZ + segment(i,j, k) = data(i,j, k) + enddo + enddo + enddo + endif + + !! Scatter from the source pe a subset of the data array. + !! The subset is to go into the segment array of the target pes. + !! The data to scatter is "moved" in a 1D array of size + !! S=((ie - is +1) * (je - js + 1) * NZ )and starts with the data at + !! position (iz,jz, kz). Recall Fortran is column-major order. + iz = 2 + jz = 2 + is = 2 + ie = 3 + js = 2 + je = 3 + if(pe .eq. root) then + call mpp_scatter(is, ie, js, je, NZ, pelist(1:npes-1), segment, data, .true., iz, jz) + else + call mpp_scatter(is, ie, js, je, NZ, pelist(1:npes -1), segment, data, .false., iz, jz) + endif + + call mpp_sync() + + !! Verify that the segment array has been updated on the target pes (i,e, those + !! in the pelist, which does not include pe numbered npes) + !! dAmount is the number of data elements that should be scattered. + dAmount = (ie - is + 1)*(je -js + 1)*NZ + dCount = 0; + + if(ANY(pe == pelist(1:npes-1))) then + kd = 1 + jd = js + jz !(4,5) + id = is + iz !!increases fastest (4,5) + !!Note below row (id index of "data() equivalent or formula") changing fastest. 
+ do k = 1, NZ + do j = 1, SS !(je -js + 1) + do i = 1, SS !(ie - is + 1) + if(dCount < dAmount) then + dCount = dCount + 1 + !!write(6,*) k, j, i, kd, jd, id + if (segment(i,j, k) /= ( kd * 100 + jd*10 + id )) then + call mpp_error(FATAL, "Test scatter 3D R4 failed - basic copy area.") + endif + !! Do to the next data element in segment + !!IF the previous one was the corner of a square: + if((id == ie + iz ) .AND. (jd == (je + jz))) then + id = is + iz + jd = js + jz + kd = kd + 1 + !!IF the previous one was the botton of a column + else if( id == ie + iz ) then + id = is + iz + jd = jd + 1 + else + id = id + 1 ! next row of segemnt() + endif + endif + enddo + enddo + enddo + endif + + call mpp_sync() ! + write(out_unit,*) "Test test_scatter_2D_R4 successful at general scatter section." + + !!Verify that the last pe (numbered npes) did not get the segment array updated! + if(pe == pelist(npes)) then + do i = 1,SS + do j = 1,SS + do k = 1, NZ + if (segment(i,j, k) /= -2 ) then + call mpp_error(FATAL, "Test scatter 3D R4 failed. pe=npes segment was changed") + endif + end do + end do + enddo + endif + + call mpp_sync() + write(out_unit,*) "Test scatter 3D R4 successful." + + end subroutine test_scatter_3D_R4 + + + !> @brief Test the mpp_scatter 3D functions with DOUBLE_KIND data arguments. + subroutine test_scatter_3D_R8(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i,j,k + real(kind=r8_kind), allocatable, dimension(:,:,:) :: data !!Data to be scattered + real(kind=r8_kind), allocatable, dimension(:,:,:) :: segment + integer :: DS, SS !!Source data size and segment size + integer :: iz, jz !!The zeroth element to be scattered is at pos data(is+iz, js+jz) + integer :: is, ie, js, je !!The amount of data to be scattered is (ie - is)*(je - js) + integer :: id, jd, kd + integer :: NZ + integer :: dAmount, dCount + + NZ = 11 !! Depth of the square tube to be scattered. + DS = 6 !! 
DS should be less than 10 for the tests below to make sense. + SS = 5 !! Can be different that DS, but see retrictions. + allocate(data(DS, DS, NZ)) + allocate(segment(SS, SS, NZ)) + + !!The full PE list is [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1 + segment = -2.0 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4,5) is 543.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + do k = 1,NZ + data(i,j, k) = k*100 + j*10 + i + enddo + enddo + enddo + !! And re-initalize segment on the root pe. + do i = 1,SS + do j = 1,SS + do k = 1,NZ + segment(i,j, k) = data(i,j, k) + enddo + enddo + enddo + endif + + !! Scatter from the source pe a subset of the data array. + !! The subset is to go into the segment array of the target pes. + !! The data to scatter is "moved" in a 1D array of size + !! S=((ie - is +1) * (je - js + 1) * NZ )and starts with the data at + !! position (iz,jz, kz). Recall Fortran is column-major order. + iz = 2 + jz = 2 + is = 2 + ie = 3 + js = 2 + je = 3 + if(pe .eq. root) then + call mpp_scatter(is, ie, js, je, NZ, pelist(1:npes-1), segment, data, .true., iz, jz) + else + call mpp_scatter(is, ie, js, je, NZ, pelist(1:npes -1), segment, data, .false., iz, jz) + endif + + call mpp_sync() + + !! Verify that the segment array has been updated on the target pes (i,e, those + !! in the pelist, which does not include pe numbered npes) + !! dAmount is the number of data elements that should be scattered. + dAmount = (ie - is + 1)*(je -js + 1)*NZ + dCount = 0; + + if(ANY(pe == pelist(1:npes-1))) then + kd = 1 + jd = js + jz !(4,5) + id = is + iz !!increases fastest (4,5) + !!Note below row (id index of "data() equivalent or formula") changing fastest. 
+ do k = 1, NZ + do j = 1, SS !(je -js + 1) + do i = 1, SS !(ie - is + 1) + if(dCount < dAmount) then + dCount = dCount + 1 + !!write(6,*) k, j, i, kd, jd, id + if (segment(i,j, k) /= ( kd * 100 + jd*10 + id )) then + call mpp_error(FATAL, "Test scatter 3D R8 failed - basic copy area.") + endif + !! Do to the next data element in segment + !!IF the previous one was the corner of a square: + if((id == ie + iz ) .AND. (jd == (je + jz))) then + id = is + iz + jd = js + jz + kd = kd + 1 + !!IF the previous one was the botton of a column + else if( id == ie + iz ) then + id = is + iz + jd = jd + 1 + else + id = id + 1 ! next row of segemnt() + endif + endif + enddo + enddo + enddo + endif + + call mpp_sync() ! + write(out_unit,*) "Test test_scatter_2D_R8 successful at general scatter section." + + !!Verify that the last pe (numbered npes) did not get the segment array updated! + if(pe == pelist(npes)) then + do i = 1,SS + do j = 1,SS + do k = 1, NZ + if (segment(i,j, k) /= -2 ) then + call mpp_error(FATAL, "Test scatter 3D R8 failed. pe=npes segment was changed") + endif + end do + end do + enddo + endif + + call mpp_sync() + write(out_unit,*) "Test scatter 3D R8 successful." + + end subroutine test_scatter_3D_R8 + + + + !> @brief Call some of the type specific (Float vs double) test_gather routines. + subroutine test_gather(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + if(npes < 3)then + call mpp_error(FATAL, "Test_gather: minimum of 3 ranks required. Not testing gather; too few ranks.") + endif + write(out_unit,*) + + call test_gather_R4(npes, pe, root, out_unit) + + call test_gather_R8(npes, pe, root, out_unit) + + end subroutine test_gather + + !> @brief Test the scalar mpp_gather routine with FLOAT_KIND data. 
+ subroutine test_gather_R4(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i + real(kind=r4_kind) :: rdata(npes) + real(kind=r4_kind) :: val + + if(npes < 3)then + call mpp_error(FATAL, "Test_gather: minimum of 3 ranks required. Not testing gather; too few ranks.") + endif + write(out_unit,*) + + val = pe + rdata = -1.0 + do i=1,npes + pelist(i) = i-1 + enddo + + call mpp_gather((/val/),rdata) + if(pe == root)then + do i=1,npes + if(INT(rdata(i)) /= pelist(i))then + write(6,*) "Gathered data ",INT(rdata(i)), " NE reference ",pelist(i), "at i=",i + call mpp_error(FATAL, "Test gather R4 uniform vector with global pelist failed") + endif + enddo + endif + + call mpp_sync() + write(out_unit,*) "Test gather uniform vector with global pelist successful" + + rdata = -1.0 + if(ANY(pe == pelist(2:npes)))call mpp_gather((/val/),rdata(2:npes),pelist(2:npes)) + if(pe == pelist(2))then + do i=2,npes + if(INT(rdata(i)) /= pelist(i))then + write(6,*) "Gathered data ",INT(rdata(i)), " NE reference ",pelist(i), "at i=",i + call mpp_error(FATAL, "Test gather R4 uniform vector with reduced pelist failed") + endif + enddo + endif + call mpp_sync() + write(out_unit,*) "Test gather uniform vector with reduced pelist successful" + + end subroutine test_gather_R4 + + + !> @brief Test the scalar mpp_gather routine with DOUBLE_KIND data. + subroutine test_gather_R8(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i + real(kind=r8_kind) :: rdata(npes) + real(kind=r8_kind) :: val + + if(npes < 3)then + call mpp_error(FATAL, "Test_gather: minimum of 3 ranks required. 
Not testing gather; too few ranks.") + endif + write(out_unit,*) + + val = pe + rdata = -1.0 + do i=1,npes + pelist(i) = i-1 + enddo + + call mpp_gather((/val/),rdata) + if(pe == root)then + do i=1,npes + if(INT(rdata(i)) /= pelist(i))then + write(6,*) "Gathered data ",INT(rdata(i)), " NE reference ",pelist(i), "at i=",i + call mpp_error(FATAL, "Test gather R8 uniform vector with global pelist failed") + endif + enddo + endif + + call mpp_sync() + write(out_unit,*) "Test gather uniform vector with global pelist successful" + + rdata = -1.0 + if(ANY(pe == pelist(2:npes)))call mpp_gather((/val/),rdata(2:npes),pelist(2:npes)) + if(pe == pelist(2))then + do i=2,npes + if(INT(rdata(i)) /= pelist(i))then + write(6,*) "Gathered data ",INT(rdata(i)), " NE reference ",pelist(i), "at i=",i + call mpp_error(FATAL, "Test gather R8 uniform vector with reduced pelist failed") + endif + enddo + endif + call mpp_sync() + write(out_unit,*) "Test gather uniform vector with reduced pelist successful" + + end subroutine test_gather_R8 + + !> @brief Test the 1Dvector mpp_gather routine. + !> @todo Change or refactor this routine to explicitly use FLOAT_KIND and DOUBLE_KIND. + subroutine test_gatherV(npes,pe,root,out_unit) + implicit none + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes),rsize(npes) + integer :: i,j,k,dsize,ssize + real,allocatable :: sdata(:), rdata(:), ref(:) + + if(npes < 3)then + call mpp_error(FATAL, "Test_gatherV: minimum of 3 ranks required. Not testing gather; too few ranks.") + elseif(npes > 9999)then + call mpp_error(FATAL, "Test_gatherV: maximum of 9999 ranks supported. 
Not testing gatherV; too many ranks.") + endif + write(out_unit,*) + + ssize = pe+1 + allocate(sdata(ssize)) + do i=1,ssize + sdata(i) = pe + 0.0001*i + enddo + do i=1,npes + pelist(i) = i-1 + rsize(i) = i + enddo + + dsize = sum(rsize) + allocate(rdata(dsize),ref(dsize)) + rdata = -1.0 + k=1 + do j=1,npes + do i=1,rsize(j) + ref(k) = pelist(j) + 0.0001*i + k = k+1 + enddo;enddo + + call mpp_gather(sdata,ssize,rdata,rsize) + + if(pe == root)then + k = 1 + do j=1,npes + do i=1,rsize(j) + if(rdata(k) /= ref(k))then + write(6,*) "Gathered data ",rdata(k), " NE reference ",ref(k), "at k=",k + call mpp_error(FATAL, "Test gatherV global pelist failed") + endif + k = k+1 + enddo;enddo + endif + + call mpp_sync() + write(out_unit,*) "Test gatherV with global pelist successful" + + rdata = -1.0 + ref(1) = -1.0 + + if(ANY(pe == pelist(2:npes)))call mpp_gather(sdata,ssize,rdata(2:),rsize(2:),pelist(2:npes)) + + if(pe == pelist(2))then + k = 1 + do j=1,npes + do i=1,rsize(j) + if(rdata(k) /= ref(k) )then + write(6,*) "Gathered data ",rdata(k), " NE reference ",ref(k), "at k=",k + call mpp_error(FATAL, "Test gatherV with reduced pelist failed") + endif + k = k+1 + enddo;enddo + endif + call mpp_sync() + + write(out_unit,*) "Test gatherV with reduced pelist successful" + deallocate(sdata,rdata,ref) + end subroutine test_gatherV + + !> @brief Test the 2D vector mpp_gather routine. + !> @todo This is a legacy routine which does not work in all conditions. For the gcc version, + !> the use of cray pointers is suspect to causing a crash at the call to mpp_gather. 
+subroutine test_gather2DV(npes,pe,root,out_unit) + implicit none + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes),rsize(npes) + integer :: pelist2(npes),rsize2(npes) + integer :: i,j,k,l,nz,ssize,nelems + real,allocatable,dimension(:,:) :: data, cdata, sbuff,rbuff + real,allocatable :: ref(:,:) + integer, parameter :: KSIZE=10 + + real :: sbuff1D(size(sbuff)) + real :: rbuff1D(size(rbuff)) + pointer(sptr,sbuff1D); pointer(rptr,rbuff1D) + + + if(npes < 3)then + call mpp_error(FATAL, "Test_gather2DV: minimum of 3 ranks required. Not testing gather; too few ranks.") + elseif(npes > 9999)then + call mpp_error(FATAL, "Test_gather2DV: maximum of 9999 ranks supported. Not testing gather2DV; too many ranks.") + return + endif + write(out_unit,*) + + ssize = pe+1 + allocate(data(ssize,KSIZE)) + do k=1,KSIZE; do i=1,ssize + data(i,k) = 10000.0*k + pe + 0.0001*i + enddo; enddo + do i=1,npes + pelist(i) = i-1 + rsize(i) = i + enddo + + nz = KSIZE + nelems = sum(rsize(:)) + + allocate(rbuff(nz,nelems)); rbuff = -1.0 + allocate(ref(nelems,nz),cdata(nelems,nz)) + ref = 0.0; cdata = 0.0 + if(pe == root)then + do k=1,KSIZE + l=1 + do j=1,npes + do i=1,rsize(j) + ref(l,k) = 10000.0*k + pelist(j) + 0.0001*i + l = l+1 + enddo; enddo;enddo + endif + allocate(sbuff(nz,ssize)) + ! this matrix inversion makes for easy gather to the IO root + ! and a clear, concise unpack + do j=1,ssize + do i=1,nz + sbuff(i,j) = data(j,i) + enddo; enddo + + ! 
Note that the gatherV implied here is asymmetric; only root needs to know the vector of recv size + sptr = LOC(sbuff); rptr = LOC(rbuff) + call mpp_gather(sbuff1D,size(sbuff),rbuff1D,nz*rsize(:)) + + if(pe == root)then + do j=1,nz + do i=1,nelems + cdata(i,j) = rbuff(j,i) + enddo; enddo + do j=1,nz + do i=1,nelems + if(cdata(i,j) /= ref(i,j))then + write(6,*) "Gathered data ",cdata(i,j), " NE reference ",ref(i,j), "at i,j=",i,j + call mpp_error(FATAL, "Test gather2DV global pelist failed") + endif + enddo;enddo + endif + + call mpp_sync() + write(out_unit,*) "Test gather2DV with global pelist successful" + + do i=1,npes + pelist2(i) = pelist(npes-i+1) + rsize2(i) = rsize(npes-i+1) + enddo + + rbuff = -1.0 + ref = 0.0; cdata = 0.0 + if(pe == pelist2(1))then + do k=1,KSIZE + l=1 + do j=1,npes + do i=1,rsize2(j) + ref(l,k) = 10000.0*k + pelist2(j) + 0.0001*i + l = l+1 + enddo; enddo;enddo + endif + + call mpp_gather(sbuff1D,size(sbuff),rbuff1D,nz*rsize2(:),pelist2) + + if(pe == pelist2(1))then + do j=1,nz + do i=1,nelems + cdata(i,j) = rbuff(j,i) + enddo; enddo + do j=1,nz + do i=1,nelems + if(cdata(i,j) /= ref(i,j))then + write(6,*) "Gathered data ",cdata(i,j), " NE reference ",ref(i,j), "at i,j=",i,j + call mpp_error(FATAL, "Test gather2DV with reversed pelist failed") + endif + enddo;enddo + endif + call mpp_sync() + write(out_unit,*) "Test gather2DV with reversed pelist successful" + deallocate(data,sbuff,rbuff,cdata,ref) + end subroutine test_gather2DV + +end program test_mpp_gatscat diff --git a/test_fms/mpp/test_mpp_gatscat.sh b/test_fms/mpp/test_mpp_gatscat.sh new file mode 100755 index 0000000000..52ccfab385 --- /dev/null +++ b/test_fms/mpp/test_mpp_gatscat.sh @@ -0,0 +1,58 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). 
+# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Ed Hartnett 11/29/19 + +# Set common test settings. +. ../test_common.sh + +skip_test="no" + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi + +# Do we need to oversubscribe +if [ ${nProc} -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. + skip_test="skip" +elif [ $nProc -lt 4 ] +then + # Need to oversubscribe the MPI + run_test test_mpp_gatscat 4 $skip_test "true" +fi + +touch input.nml +run_test test_mpp_gatscat 4 $skip_test + diff --git a/test_fms/mpp/test_mpp_get_ascii_lines b/test_fms/mpp/test_mpp_get_ascii_lines new file mode 100755 index 0000000000..13b0153302 --- /dev/null +++ b/test_fms/mpp/test_mpp_get_ascii_lines @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_get_ascii_lines - temporary wrapper script for .libs/test_mpp_get_ascii_lines +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_get_ascii_lines program cannot be directly executed until all the libtool +# libraries that it depends on are installed. 
+# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. 
+# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_get_ascii_lines:test_mpp_get_ascii_lines:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_get_ascii_lines:test_mpp_get_ascii_lines:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_get_ascii_lines:test_mpp_get_ascii_lines:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. 
+func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. 
+ absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_get_ascii_lines' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_global_field.F90 b/test_fms/mpp/test_mpp_global_field.F90 new file mode 100644 index 0000000000..9b90117baf --- /dev/null +++ b/test_fms/mpp/test_mpp_global_field.F90 @@ -0,0 +1,1290 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. 
If not, see . +!*********************************************************************** +program test_mpp_global_field + + use platform_mod + use compare_data_checksums + use compare_data_checksums_int + use mpp_mod, only : mpp_init, mpp_error, FATAL, mpp_init_test_requests_allocated + use mpp_mod, only : mpp_declare_pelist, mpp_pe, mpp_npes, mpp_root_pe + !use mpp_mod, only : mpp_clock_begin, mpp_clock_end, mpp_clock_id, MPP_CLOCK_SYNC, MPP_CLOCK_DETAILED + use mpp_domains_mod, only : domain2D + use mpp_domains_mod, only : CENTER, EAST, NORTH, CORNER, XUPDATE, YUPDATE + use mpp_domains_mod, only : mpp_domains_init, mpp_domains_exit + use mpp_domains_mod, only : mpp_define_layout, mpp_define_domains + use mpp_domains_mod, only : mpp_get_compute_domain, mpp_get_data_domain, mpp_domains_set_stack_size + use mpp_domains_mod, only : mpp_global_field + + implicit none + + integer, parameter :: nx=20, ny=20, nz=40 + integer, parameter :: whalo=2, ehalo=2, shalo=2, nhalo=2 + integer, parameter :: stackmax=4000000 + + integer :: pe, npes, ierr + integer :: layout(2) + + + !> call mpp_init + call mpp_init(test_level=mpp_init_test_requests_allocated) + + !> get pe info + pe = mpp_pe() + npes = mpp_npes() + + !> initialize mpp domain(s) + call mpp_domains_init() + call mpp_domains_set_stack_size(stackmax) + + !> call test_global_field_r4_2d + call test_global_field_r4_2d( 'Non-symmetry' ) + call test_global_field_r4_2d( 'Symmetry center' ) + call test_global_field_r4_2d( 'Symmetry corner' ) + call test_global_field_r4_2d( 'Symmetry east' ) + call test_global_field_r4_2d( 'Symmetry north' ) + !> call test_global_field_r8_2d + call test_global_field_r8_2d( 'Non-symmetry' ) + call test_global_field_r8_2d( 'Symmetry center' ) + call test_global_field_r8_2d( 'Symmetry corner' ) + call test_global_field_r8_2d( 'Symmetry east' ) + call test_global_field_r8_2d( 'Symmetry north' ) + !> call test_global_field_i4_2d + call test_global_field_i4_2d( 'Non-symmetry' ) + call 
test_global_field_i4_2d( 'Symmetry center' ) + call test_global_field_i4_2d( 'Symmetry corner' ) + call test_global_field_i4_2d( 'Symmetry east' ) + call test_global_field_i4_2d( 'Symmetry north' ) + !> call test_global_field_i8_2d + call test_global_field_i8_2d( 'Non-symmetry' ) + call test_global_field_i8_2d( 'Symmetry center' ) + call test_global_field_i8_2d( 'Symmetry corner' ) + call test_global_field_i8_2d( 'Symmetry east' ) + call test_global_field_i8_2d( 'Symmetry north' ) + !> call test_global_field_r4_3d tests + call test_global_field_r4_3d( 'Non-symmetry' ) + call test_global_field_r4_3d( 'Symmetry center' ) + call test_global_field_r4_3d( 'Symmetry corner' ) + call test_global_field_r4_3d( 'Symmetry east' ) + call test_global_field_r4_3d( 'Symmetry north' ) + !> call test_global_field_r8_3d tests + call test_global_field_r8_3d( 'Non-symmetry' ) + call test_global_field_r8_3d( 'Symmetry center' ) + call test_global_field_r8_3d( 'Symmetry corner' ) + call test_global_field_r8_3d( 'Symmetry east' ) + call test_global_field_r8_3d( 'Symmetry north' ) + !> call test_global_field_i4_3d tests + call test_global_field_i4_3d( 'Non-symmetry' ) + call test_global_field_i4_3d( 'Symmetry center' ) + call test_global_field_i4_3d( 'Symmetry corner' ) + call test_global_field_i4_3d( 'Symmetry east' ) + call test_global_field_i4_3d( 'Symmetry north' ) + !> call test_global_field_i8_3d tests + call test_global_field_i8_3d( 'Non-symmetry' ) + call test_global_field_i8_3d( 'Symmetry center' ) + call test_global_field_i8_3d( 'Symmetry corner' ) + call test_global_field_i8_3d( 'Symmetry east' ) + call test_global_field_i8_3d( 'Symmetry north' ) + + !> exit + call mpp_domains_exit() + call MPI_finalize(ierr) + +contains + !> + !> test_global_field_r4_2d + !> + subroutine test_global_field_r4_2d( type ) + + implicit none + + character(len=*), intent(in) :: type + + real(kind=r4_kind), parameter :: zero = 0.0 + + type(domain2D) :: domain + integer :: position, ishift, jshift, 
ni, nj, i, j, k + integer :: is, ie, js, je, isd, ied, jsd, jed + !integer :: id + integer, allocatable :: pelist(:) + real(kind=r4_kind), allocatable :: global1(:,:), x(:,:), gcheck(:,:) + + + !> set up domain + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + select case(type) + case( 'Non-symmetry' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type ) + case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 'Symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. ) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) + end select + + !> get compute domain + call mpp_get_compute_domain( domain, is, ie, js, je ) + !> get data domain + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + + !> determine if an extra point is needed + ishift = 0 ; jshift = 0 ; position=CENTER + select case(type) + case ('Symmetry corner') + ishift = 1 ; jshift = 1 ; position=CORNER + case ('Symmetry east') + ishift = 1 ; jshift = 0 ; position=EAST + case ('Symmetry north') + ishift = 0 ; jshift = 1 ; position=NORTH + end select + + ie = ie+ishift ; je = je+jshift + ied = ied+ishift ; jed = jed+jshift + ni = nx+ishift ; nj = ny+jshift + + !> assign global + allocate( global1(1-whalo:ni+ehalo,1-shalo:nj+nhalo) ) + global1 = zero + do j=1, nj + do i=1, ni + global1(i,j) = real( i*1e-3+j*1e-6, kind=r4_kind ) + end do + enddo + + allocate( gcheck(ni,nj) ) + + !> allocate for global domain + allocate( x(isd:ied,jsd:jed) ) + x(:,:) = global1(isd:ied,jsd:jed) + + !> test the data on data domain + gcheck = zero + !id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare 
checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on r4 data domain' ) + + + !> Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) + !! will be declared. But for the x-direction global field, mpp_sync_self will + !! be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) + !! in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), + !! deadlock will happen. For example npes = 6 and layout = (2,3), pelist = (4,5) + !! will be set in mpp_sync_self. To solve the problem, some explicit mpp_declare_pelist + !! on all pe is needed for those partial pelist. But for y-update, it is ok. + !! because the pelist in y-update is not continous. + allocate( pelist(0:layout(1)-1) ) + do j = 0, layout(2)-1 + do i = 0, layout(1)-1 + pelist(i) = j*layout(1) + i + end do + call mpp_declare_pelist(pelist) + end do + deallocate(pelist) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,js:je), gcheck(1:ni,js:je), type//' mpp_global_field xupdate only on r4 data domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(is:ie,1:nj), gcheck(is:ie,1:nj), type//' mpp_global_field yupdate only on r4 data domain' ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on r4 data domain' ) + + !> test the data on compute domain + + deallocate(x) + allocate( x(is:ie,js:je) ) + 
x(is:ie,js:je) = global1(is:ie,js:je) + + gcheck = zero + !id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, position=position ) + !call mpp_clock_end(id) + !>compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on r4 compute domain' ) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,js:je), gcheck(1:ni,js:je), type//' mpp_global_field xupdate only on r4 compute domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(is:ie,1:nj), gcheck(is:ie,1:nj), type//' mpp_global_field yupdate only on r4 compute domain' ) + + deallocate(global1, gcheck, x) + + end subroutine test_global_field_r4_2d + !> + !> test_global_field_r8_2d + !> + subroutine test_global_field_r8_2d( type ) + + implicit none + + character(len=*), intent(in) :: type + + real(kind=r8_kind), parameter :: zero = 0.0 + + type(domain2D) :: domain + integer :: position, ishift, jshift, ni, nj, i, j, k + integer :: is, ie, js, je, isd, ied, jsd, jed + !integer :: id + integer, allocatable :: pelist(:) + real(kind=r8_kind), allocatable :: global1(:,:), x(:,:), gcheck(:,:) + + + !> set up domain + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + select case(type) + case( 'Non-symmetry' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type ) + case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 
'Symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. ) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) + end select + + !> get compute domain + call mpp_get_compute_domain( domain, is, ie, js, je ) + !> get data domain + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + + !> determine if an extra point is needed + ishift = 0 ; jshift = 0 ; position=CENTER + select case(type) + case ('Symmetry corner') + ishift = 1 ; jshift = 1 ; position=CORNER + case ('Symmetry east') + ishift = 1 ; jshift = 0 ; position=EAST + case ('Symmetry north') + ishift = 0 ; jshift = 1 ; position=NORTH + end select + + ie = ie+ishift ; je = je+jshift + ied = ied+ishift ; jed = jed+jshift + ni = nx+ishift ; nj = ny+jshift + + !> assign global + allocate( global1(1-whalo:ni+ehalo,1-shalo:nj+nhalo) ) + global1 = zero + do j=1, nj + do i=1, ni + global1(i,j) = real( i*1e-3+j*1e-6, kind=r8_kind ) + end do + enddo + + allocate( gcheck(ni,nj) ) + + !> allocate for global domain + allocate( x(isd:ied,jsd:jed) ) + x(:,:) = global1(isd:ied,jsd:jed) + + !> test the data on data domain + gcheck = zero + !id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on r8 data domain' ) + + + !> Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) + !! will be declared. But for the x-direction global field, mpp_sync_self will + !! be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) + !! in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), + !! deadlock will happen. 
For example npes = 6 and layout = (2,3), pelist = (4,5) + !! will be set in mpp_sync_self. To solve the problem, some explicit mpp_declare_pelist + !! on all pe is needed for those partial pelist. But for y-update, it is ok. + !! because the pelist in y-update is not continous. + allocate( pelist(0:layout(1)-1) ) + do j = 0, layout(2)-1 + do i = 0, layout(1)-1 + pelist(i) = j*layout(1) + i + end do + call mpp_declare_pelist(pelist) + end do + deallocate(pelist) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,js:je), gcheck(1:ni,js:je), type//' mpp_global_field xupdate only on r8 data domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(is:ie,1:nj), gcheck(is:ie,1:nj), type//' mpp_global_field yupdate only on r8 data domain' ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on r8 data domain' ) + + !> test the data on compute domain + + deallocate(x) + allocate( x(is:ie,js:je) ) + x(is:ie,js:je) = global1(is:ie,js:je) + + gcheck = zero + !id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, position=position ) + !call mpp_clock_end(id) + !>compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on r8 compute domain' ) + + !> xupdate + gcheck = zero + !call 
mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,js:je), gcheck(1:ni,js:je), type//' mpp_global_field xupdate only on r8 compute domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(is:ie,1:nj), gcheck(is:ie,1:nj), type//' mpp_global_field yupdate only on r8 compute domain' ) + + deallocate(global1, gcheck, x) + + end subroutine test_global_field_r8_2d + !> + !> test_global_field_i4_2d + !> + subroutine test_global_field_i4_2d( type ) + + implicit none + + character(len=*), intent(in) :: type + + integer(kind=i4_kind), parameter :: zero = 0 + + type(domain2D) :: domain + integer :: position, ishift, jshift, ni, nj, i, j, k + integer :: is, ie, js, je, isd, ied, jsd, jed + !integer :: id + integer, allocatable :: pelist(:) + integer(kind=i4_kind), allocatable :: global1(:,:), x(:,:), gcheck(:,:) + + + !> set up domain + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + select case(type) + case( 'Non-symmetry' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type ) + case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 'Symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) + end select + + !> get compute domain + call mpp_get_compute_domain( domain, is, ie, js, je ) + !> get data domain + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + + !> determine if an extra point is needed + ishift = 0 ; jshift = 0 ; position=CENTER + select case(type) + case ('Symmetry corner') + ishift = 1 ; jshift = 1 ; position=CORNER + case ('Symmetry east') + ishift = 1 ; jshift = 0 ; position=EAST + case ('Symmetry north') + ishift = 0 ; jshift = 1 ; position=NORTH + end select + + ie = ie+ishift ; je = je+jshift + ied = ied+ishift ; jed = jed+jshift + ni = nx+ishift ; nj = ny+jshift + + !> assign global + allocate( global1(1-whalo:ni+ehalo,1-shalo:nj+nhalo) ) + global1 = zero + do j=1, nj + do i=1, ni + global1(i,j) = int( i*1e3+j*1e6, kind=i4_kind ) + end do + enddo + + allocate( gcheck(ni,nj) ) + + !> allocate for global domain + allocate( x(isd:ied,jsd:jed) ) + x(:,:) = global1(isd:ied,jsd:jed) + + !> test the data on data domain + gcheck = zero + !id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on i4 data domain' ) + + + !> Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) + !! will be declared. But for the x-direction global field, mpp_sync_self will + !! be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) + !! in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), + !! deadlock will happen. For example npes = 6 and layout = (2,3), pelist = (4,5) + !! will be set in mpp_sync_self. To solve the problem, some explicit mpp_declare_pelist + !! 
on all pe is needed for those partial pelist. But for y-update, it is ok. + !! because the pelist in y-update is not continous. + allocate( pelist(0:layout(1)-1) ) + do j = 0, layout(2)-1 + do i = 0, layout(1)-1 + pelist(i) = j*layout(1) + i + end do + call mpp_declare_pelist(pelist) + end do + deallocate(pelist) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,js:je), gcheck(1:ni,js:je), type//' mpp_global_field xupdate only on i4 data domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(is:ie,1:nj), gcheck(is:ie,1:nj), type//' mpp_global_field yupdate only on i4 data domain' ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on i4 data domain' ) + + !> test the data on compute domain + + deallocate(x) + allocate( x(is:ie,js:je) ) + x(is:ie,js:je) = global1(is:ie,js:je) + + gcheck = zero + !id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, position=position ) + !call mpp_clock_end(id) + !>compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on i4 compute domain' ) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> 
compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,js:je), gcheck(1:ni,js:je), type//' mpp_global_field xupdate only on i4 compute domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(is:ie,1:nj), gcheck(is:ie,1:nj), type//' mpp_global_field yupdate only on i4 compute domain' ) + + deallocate(global1, gcheck, x) + + end subroutine test_global_field_i4_2d + !> + !> test_global_field_i8_2d + !> + subroutine test_global_field_i8_2d( type ) + + implicit none + + character(len=*), intent(in) :: type + + integer(kind=i8_kind), parameter :: zero = 0 + + type(domain2D) :: domain + integer :: position, ishift, jshift, ni, nj, i, j, k + integer :: is, ie, js, je, isd, ied, jsd, jed + !integer :: id + integer, allocatable :: pelist(:) + integer(kind=i8_kind), allocatable :: global1(:,:), x(:,:), gcheck(:,:) + + + !> set up domain + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + select case(type) + case( 'Non-symmetry' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type ) + case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 'Symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) + end select + + !> get compute domain + call mpp_get_compute_domain( domain, is, ie, js, je ) + !> get data domain + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + + !> determine if an extra point is needed + ishift = 0 ; jshift = 0 ; position=CENTER + select case(type) + case ('Symmetry corner') + ishift = 1 ; jshift = 1 ; position=CORNER + case ('Symmetry east') + ishift = 1 ; jshift = 0 ; position=EAST + case ('Symmetry north') + ishift = 0 ; jshift = 1 ; position=NORTH + end select + + ie = ie+ishift ; je = je+jshift + ied = ied+ishift ; jed = jed+jshift + ni = nx+ishift ; nj = ny+jshift + + !> assign global + allocate( global1(1-whalo:ni+ehalo,1-shalo:nj+nhalo) ) + global1 = zero + do j=1, nj + do i=1, ni + global1(i,j) = int( i*1e3+j*1e6, kind=i8_kind ) + end do + enddo + + allocate( gcheck(ni,nj) ) + + !> allocate for global domain + allocate( x(isd:ied,jsd:jed) ) + x(:,:) = global1(isd:ied,jsd:jed) + + !> test the data on data domain + gcheck = zero + !id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on i8 data domain' ) + + + !> Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) + !! will be declared. But for the x-direction global field, mpp_sync_self will + !! be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) + !! in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), + !! deadlock will happen. For example npes = 6 and layout = (2,3), pelist = (4,5) + !! will be set in mpp_sync_self. To solve the problem, some explicit mpp_declare_pelist + !! 
on all pe is needed for those partial pelist. But for y-update, it is ok. + !! because the pelist in y-update is not continous. + allocate( pelist(0:layout(1)-1) ) + do j = 0, layout(2)-1 + do i = 0, layout(1)-1 + pelist(i) = j*layout(1) + i + end do + call mpp_declare_pelist(pelist) + end do + deallocate(pelist) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,js:je), gcheck(1:ni,js:je), type//' mpp_global_field xupdate only on i8 data domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(is:ie,1:nj), gcheck(is:ie,1:nj), type//' mpp_global_field yupdate only on i8 data domain' ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on i8 data domain' ) + + !> test the data on compute domain + + deallocate(x) + allocate( x(is:ie,js:je) ) + x(is:ie,js:je) = global1(is:ie,js:je) + + gcheck = zero + !id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, position=position ) + !call mpp_clock_end(id) + !>compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj), gcheck, type//' mpp_global_field on i8 compute domain' ) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> 
compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,js:je), gcheck(1:ni,js:je), type//' mpp_global_field xupdate only on i8 compute domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je), gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(is:ie,1:nj), gcheck(is:ie,1:nj), type//' mpp_global_field yupdate only on i8 compute domain' ) + + deallocate(global1, gcheck, x) + + end subroutine test_global_field_i8_2d + !> + !> test_global_field_r4_3d + !> + subroutine test_global_field_r4_3d( type ) + + implicit none + + character(len=*), intent(in) :: type + + real(kind=r4_kind) :: zero = 0.0 + + type(domain2D) :: domain + integer :: position, ishift, jshift, ni, nj, i, j, k + integer :: is, ie, js, je, isd, ied, jsd, jed + !integer :: id + integer, allocatable :: pelist(:) + real(kind=r4_kind), allocatable :: global1(:,:,:), x(:,:,:), gcheck(:,:,:) + + + !> set up domain + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + select case(type) + case( 'Non-symmetry' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type ) + case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 'Symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) + end select + + !> get compute domain + call mpp_get_compute_domain( domain, is, ie, js, je ) + !> get data domain + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + + !> determine if an extra point is needed + ishift = 0 ; jshift = 0 ; position = CENTER + select case(type) + case ('Symmetry corner') + ishift = 1 ; jshift = 1 ; position=CORNER + case ('Symmetry east') + ishift = 1 ; jshift = 0 ; position=EAST + case ('Symmetry north') + ishift = 0 ; jshift = 1 ; position=NORTH + end select + + ie = ie+ishift ; je = je+jshift + ied = ied+ishift ; jed = jed+jshift + ni = nx+ishift ; nj = ny+jshift + + !> assign global1 + allocate( global1(1-whalo:ni+ehalo,1-shalo:nj+nhalo,nz) ) + global1 = zero + do k=1, nz + do j=1, nj + do i=1, ni + global1(i,j,k) = real( k+i*1e-3+j*1e-6, kind=r4_kind ) + end do + end do + enddo + + allocate( gcheck(ni,nj,nz) ) + + !> for data domain + allocate( x(isd:ied,jsd:jed, nz) ) + x(:,:,:) = global1(isd:ied,jsd:jed,:) + + !> test the data on data domain + gcheck = zero + !id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on r4 data domain' ) + + !> Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) + !! will be declared. But for the x-direction global field, mpp_sync_self will + !! be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) + !! in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), + !! deadlock will happen. For example npes = 6 and layout = (2,3), pelist = (4,5) + !! will be set in mpp_sync_self. 
To solve the problem, some explicit mpp_declare_pelist + !! on all pe is needed for those partial pelist. But for y-update, it is ok. + !! because the pelist in y-update is not continous. + allocate( pelist(0:layout(1)-1) ) + do j = 0, layout(2)-1 + do i = 0, layout(1)-1 + pelist(i) = j*layout(1) + i + end do + call mpp_declare_pelist(pelist) + end do + deallocate(pelist) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:),type//' mpp_global_field xupdate only on r4 data domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:),type//' mpp_global_field yupdate only on r4 data domain' ) + + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj,:), gcheck,type//' mpp_global_field on r4 data domain' ) + + !> test the data on compute domain + gcheck = zero + !id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, position=position ) + !call mpp_clock_end(id) + !>compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on r4 compute domain' ) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global 
and x arrays + call compare_checksums( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:), & + type//' mpp_global_field xupdate only on r4 compute domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:), & + type//' mpp_global_field yupdate only on r4 compute domain' ) + + deallocate(global1, gcheck, x) + + end subroutine test_global_field_r4_3d + !> + !> test_global_field_r8_3d + !> + subroutine test_global_field_r8_3d( type ) + + implicit none + + character(len=*), intent(in) :: type + + real(kind=r8_kind) :: zero = 0.0 + + type(domain2D) :: domain + integer :: position, ishift, jshift, ni, nj, i, j, k + integer :: is, ie, js, je, isd, ied, jsd, jed + !integer :: id + integer, allocatable :: pelist(:) + real(kind=r8_kind), allocatable :: global1(:,:,:), x(:,:,:), gcheck(:,:,:) + + + !> set up domain + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + select case(type) + case( 'Non-symmetry' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type ) + case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 'Symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) + end select + + !> get compute domain + call mpp_get_compute_domain( domain, is, ie, js, je ) + !> get data domain + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + + !> determine if an extra point is needed + ishift = 0 ; jshift = 0 ; position = CENTER + select case(type) + case ('Symmetry corner') + ishift = 1 ; jshift = 1 ; position=CORNER + case ('Symmetry east') + ishift = 1 ; jshift = 0 ; position=EAST + case ('Symmetry north') + ishift = 0 ; jshift = 1 ; position=NORTH + end select + + ie = ie+ishift ; je = je+jshift + ied = ied+ishift ; jed = jed+jshift + ni = nx+ishift ; nj = ny+jshift + + !> assign global1 + allocate( global1(1-whalo:ni+ehalo,1-shalo:nj+nhalo,nz) ) + global1 = zero + do k=1, nz + do j=1, nj + do i=1, ni + global1(i,j,k) = real( k+i*1e-3+j*1e-6, kind=r8_kind ) + end do + end do + enddo + + allocate( gcheck(ni,nj,nz) ) + + !> for data domain + allocate( x(isd:ied,jsd:jed, nz) ) + x(:,:,:) = global1(isd:ied,jsd:jed,:) + + !> test the data on data domain + gcheck = zero + !id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on r8 data domain' ) + + !> Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) + !! will be declared. But for the x-direction global field, mpp_sync_self will + !! be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) + !! in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), + !! deadlock will happen. For example npes = 6 and layout = (2,3), pelist = (4,5) + !! will be set in mpp_sync_self. 
To solve the problem, some explicit mpp_declare_pelist + !! on all pe is needed for those partial pelist. But for y-update, it is ok. + !! because the pelist in y-update is not continous. + allocate( pelist(0:layout(1)-1) ) + do j = 0, layout(2)-1 + do i = 0, layout(1)-1 + pelist(i) = j*layout(1) + i + end do + call mpp_declare_pelist(pelist) + end do + deallocate(pelist) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:),type//' mpp_global_field xupdate only on r8 data domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:),type//' mpp_global_field yupdate only on r8 data domain' ) + + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj,:), gcheck,type//' mpp_global_field on r8 data domain' ) + + !> test the data on compute domain + gcheck = zero + !id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, position=position ) + !call mpp_clock_end(id) + !>compare checksums between global and x arrays + call compare_checksums( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on r8 compute domain' ) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global 
and x arrays + call compare_checksums( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:), & + type//' mpp_global_field xupdate only on r8 compute domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:), & + type//' mpp_global_field yupdate only on r8 compute domain' ) + + deallocate(global1, gcheck, x) + + end subroutine test_global_field_r8_3d + !> + !> test_global_field_i4_3d + !> + subroutine test_global_field_i4_3d( type ) + + implicit none + + character(len=*), intent(in) :: type + + integer(kind=i4_kind) :: zero = 0 + + type(domain2D) :: domain + integer :: position, ishift, jshift, ni, nj, i, j, k + integer :: is, ie, js, je, isd, ied, jsd, jed + !integer :: id + integer, allocatable :: pelist(:) + integer(kind=i4_kind), allocatable :: global1(:,:,:), x(:,:,:), gcheck(:,:,:) + + + !> set up domain + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + select case(type) + case( 'Non-symmetry' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type ) + case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 'Symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) + end select + + !> get compute domain + call mpp_get_compute_domain( domain, is, ie, js, je ) + !> get data domain + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + + !> determine if an extra point is needed + ishift = 0 ; jshift = 0 ; position = CENTER + select case(type) + case ('Symmetry corner') + ishift = 1 ; jshift = 1 ; position=CORNER + case ('Symmetry east') + ishift = 1 ; jshift = 0 ; position=EAST + case ('Symmetry north') + ishift = 0 ; jshift = 1 ; position=NORTH + end select + + ie = ie+ishift ; je = je+jshift + ied = ied+ishift ; jed = jed+jshift + ni = nx+ishift ; nj = ny+jshift + + !> assign global1 + allocate( global1(1-whalo:ni+ehalo,1-shalo:nj+nhalo,nz) ) + global1 = zero + do k=1, nz + do j=1, nj + do i=1, ni + global1(i,j,k) = int( k+i*1e3+j*1e6, kind=i4_kind ) + end do + end do + enddo + + allocate( gcheck(ni,nj,nz) ) + + !> for data domain + allocate( x(isd:ied,jsd:jed, nz) ) + x(:,:,:) = global1(isd:ied,jsd:jed,:) + + !> test the data on data domain + gcheck = zero + !id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on i4 data domain' ) + + !> Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) + !! will be declared. But for the x-direction global field, mpp_sync_self will + !! be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) + !! in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), + !! deadlock will happen. For example npes = 6 and layout = (2,3), pelist = (4,5) + !! will be set in mpp_sync_self. 
To solve the problem, some explicit mpp_declare_pelist + !! on all pe is needed for those partial pelist. But for y-update, it is ok. + !! because the pelist in y-update is not continous. + allocate( pelist(0:layout(1)-1) ) + do j = 0, layout(2)-1 + do i = 0, layout(1)-1 + pelist(i) = j*layout(1) + i + end do + call mpp_declare_pelist(pelist) + end do + deallocate(pelist) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:),type//' mpp_global_field xupdate only on i4 data domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:),type//' mpp_global_field yupdate only on i4 data domain' ) + + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj,:), gcheck,type//' mpp_global_field on i4 data domain' ) + + !> test the data on compute domain + gcheck = zero + !id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, position=position ) + !call mpp_clock_end(id) + !>compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on i4 compute domain' ) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare 
checksums between global and x arrays + call compare_checksums_int( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:), & + type//' mpp_global_field xupdate only on i4 compute domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:), & + type//' mpp_global_field yupdate only on i4 compute domain' ) + + deallocate(global1, gcheck, x) + + end subroutine test_global_field_i4_3d + !> + !> test_global_field_i8_3d + !> + subroutine test_global_field_i8_3d( type ) + + implicit none + + character(len=*), intent(in) :: type + + integer(kind=i8_kind) :: zero = 0 + + type(domain2D) :: domain + integer :: position, ishift, jshift, ni, nj, i, j, k + integer :: is, ie, js, je, isd, ied, jsd, jed + !integer :: id + integer, allocatable :: pelist(:) + integer(kind=i8_kind), allocatable :: global1(:,:,:), x(:,:,:), gcheck(:,:,:) + + + !> set up domain + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + select case(type) + case( 'Non-symmetry' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type ) + case( 'Symmetry center', 'Symmetry corner', 'Symmetry east', 'Symmetry north' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=type, symmetry = .true. 
) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//type//' in test_global_field' ) + end select + + !> get compute domain + call mpp_get_compute_domain( domain, is, ie, js, je ) + !> get data domain + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + + !> determine if an extra point is needed + ishift = 0 ; jshift = 0 ; position = CENTER + select case(type) + case ('Symmetry corner') + ishift = 1 ; jshift = 1 ; position=CORNER + case ('Symmetry east') + ishift = 1 ; jshift = 0 ; position=EAST + case ('Symmetry north') + ishift = 0 ; jshift = 1 ; position=NORTH + end select + + ie = ie+ishift ; je = je+jshift + ied = ied+ishift ; jed = jed+jshift + ni = nx+ishift ; nj = ny+jshift + + !> assign global1 + allocate( global1(1-whalo:ni+ehalo,1-shalo:nj+nhalo,nz) ) + global1 = zero + do k=1, nz + do j=1, nj + do i=1, ni + global1(i,j,k) = int( k+i*1e3+j*1e6, kind=i8_kind ) + end do + end do + enddo + + allocate( gcheck(ni,nj,nz) ) + + !> for data domain + allocate( x(isd:ied,jsd:jed, nz) ) + x(:,:,:) = global1(isd:ied,jsd:jed,:) + + !> test the data on data domain + gcheck = zero + !id = mpp_clock_id( type//' global field on data domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on i8 data domain' ) + + !> Since in the disjoint redistribute mpp test, pelist1 = (npes/2+1 .. npes-1) + !! will be declared. But for the x-direction global field, mpp_sync_self will + !! be called. For some pe count, pelist1 will be set ( only on pe of pelist1 ) + !! in the mpp_sync_self call, later when calling mpp_declare_pelist(pelist1), + !! deadlock will happen. For example npes = 6 and layout = (2,3), pelist = (4,5) + !! will be set in mpp_sync_self. 
To solve the problem, some explicit mpp_declare_pelist + !! on all pe is needed for those partial pelist. But for y-update, it is ok. + !! because the pelist in y-update is not continous. + allocate( pelist(0:layout(1)-1) ) + do j = 0, layout(2)-1 + do i = 0, layout(1)-1 + pelist(i) = j*layout(1) + i + end do + call mpp_declare_pelist(pelist) + end do + deallocate(pelist) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:),type//' mpp_global_field xupdate only on i8 data domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:),type//' mpp_global_field yupdate only on i8 data domain' ) + + !call mpp_clock_begin(id) + call mpp_global_field( domain, x, gcheck, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj,:), gcheck,type//' mpp_global_field on i8 data domain' ) + + !> test the data on compute domain + gcheck = zero + !id = mpp_clock_id( type//' global field on compute domain', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, position=position ) + !call mpp_clock_end(id) + !>compare checksums between global and x arrays + call compare_checksums_int( global1(1:ni,1:nj,:), gcheck, type//' mpp_global_field on i8 compute domain' ) + + !> xupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, flags=XUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare 
checksums between global and x arrays + call compare_checksums_int( global1(1:ni,js:je,:), gcheck(1:ni,js:je,:), & + type//' mpp_global_field xupdate only on i8 compute domain' ) + + !> yupdate + gcheck = zero + !call mpp_clock_begin(id) + call mpp_global_field( domain, x(is:ie,js:je,:), gcheck, flags=YUPDATE, position=position ) + !call mpp_clock_end(id) + !> compare checksums between global and x arrays + call compare_checksums_int( global1(is:ie,1:nj,:), gcheck(is:ie,1:nj,:), & + type//' mpp_global_field yupdate only on i8 compute domain' ) + + deallocate(global1, gcheck, x) + + end subroutine test_global_field_i8_3d + +end program test_mpp_global_field diff --git a/test_fms/mpp/test_mpp_global_field.sh b/test_fms/mpp/test_mpp_global_field.sh new file mode 100755 index 0000000000..1a2e2ff63a --- /dev/null +++ b/test_fms/mpp/test_mpp_global_field.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Uriel Ramirez 07/15/2020 + +# Set common test settings. +. 
../test_common.sh + +touch input.nml +run_test test_mpp_global_field 4 diff --git a/test_fms/mpp/test_mpp_global_field_ug.F90 b/test_fms/mpp/test_mpp_global_field_ug.F90 new file mode 100644 index 0000000000..bc810fbdb0 --- /dev/null +++ b/test_fms/mpp/test_mpp_global_field_ug.F90 @@ -0,0 +1,700 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . 
+!*********************************************************************** +program test_mpp_global_field_ug + + use platform_mod + use compare_data_checksums + use compare_data_checksums_int + use mpp_mod, only : mpp_init, mpp_error, FATAL, NOTE, mpp_init_test_requests_allocated + use mpp_mod, only : mpp_pe, mpp_npes, mpp_root_pe, mpp_broadcast + use mpp_domains_mod, only : mpp_domains_init, mpp_domains_set_stack_size, mpp_domains_exit + use mpp_domains_mod, only : mpp_define_layout, mpp_define_mosaic, mpp_get_compute_domain, mpp_get_compute_domains, mpp_get_data_domain + use mpp_domains_mod, only : mpp_get_ug_global_domain, mpp_global_field_ug + use mpp_domains_mod, only : domain2D, domainUG, mpp_define_unstruct_domain, mpp_get_UG_domain_tile_id + use mpp_domains_mod, only : mpp_get_UG_compute_domain, mpp_pass_SG_to_UG, mpp_pass_UG_to_SG + use mpp_domains_mod, only : mpp_get_ug_global_domain, mpp_global_field_ug, mpp_get_tile_id + + implicit none + + integer :: pe, npes + integer :: nx=128, ny=128, nz=40, stackmax=4000000 + integer :: whalo = 2, ehalo=2, shalo=2, nhalo=2 + integer :: nx_cubic=20, ny_cubic=20, layout_cubic(2)=(/0,0/), layout(2) + + type(domain2D) :: SG_domain + type(domainUG) :: UG_domain + + character(10) :: type='Cubic-Grid' + + integer :: ntiles, npes_per_tile + integer :: i, j, k, l, n + integer :: isc, iec, jsc, jec, isd, ied, jsd, jed + integer :: istart, iend, pos, tile + integer, allocatable, dimension(:) :: npts_tile, grid_index, ntiles_grid + logical, allocatable, dimension(:,:,:) :: lmask + logical :: cubic_grid + + integer :: ierr + + !> call mpp_init + call mpp_init(test_level=mpp_init_test_requests_allocated) + + !> get pe info + pe = mpp_pe() + npes = mpp_npes() + + !> initialize mpp domain(s) + call mpp_domains_init() + call mpp_domains_set_stack_size(stackmax) + + call setup_domains() + + call mpp_global_field_ug_r4() + call mpp_global_field_ug_r8() + call mpp_global_field_ug_i4() + call mpp_global_field_ug_i8() + + call 
mpp_domains_exit() + call MPI_FINALIZE(ierr) + + +contains + !> + !> mpp_global_field_ug_r4 BOTH 2D and 3D arrays + !> + subroutine mpp_global_field_ug_r4() + + implicit none + + real(kind=r4_kind) :: zero = 0.0, one=1.0 + real(kind=r4_kind), allocatable, dimension(:,:) :: x1, x2, g1, g2 + real(kind=r4_kind), allocatable, dimension(:,:,:) :: a1, a2, gdata + + integer :: ism, iem, jsm, jem, lsg, leg + + !--- set up data + allocate(gdata(nx,ny,ntiles)) ; gdata = -one + do n = 1, ntiles + do j = 1, ny + do i = 1, nx + if( lmask(i,j,n) ) gdata(i,j,n) = real( n*1.e+3 + i + j*1.e-3, kind=r4_kind ) + end do + end do + end do + + !--- test the 2-D data is on computing domain + allocate( a1(isc:iec, jsc:jec,1), a2(isc:iec,jsc:jec,1 ) ) + tile = mpp_pe()/npes_per_tile + 1 + do j = jsc, jec + do i = isc, iec + a1(i,j,1) = gdata(i,j,tile) + enddo + enddo + a2 = -one + + allocate(x1(istart:iend,1), x2(istart:iend,1)) + x1 = -one ; x2 = -one + !--- fill the value of x2 + tile = mpp_get_UG_domain_tile_id(UG_domain) + pos = 0 + do n = 1, tile-1 + pos = pos + npts_tile(n) + enddo + do l = istart, iend + i = mod((grid_index(pos+l)-1), nx) + 1 + j = (grid_index(pos+l)-1)/nx + 1 + x2(l,1) = gdata(i,j,tile) + enddo + + call mpp_pass_SG_to_UG(UG_domain, a1(:,:,1), x1(:,1)) + call compare_checksums(x1, x2, type//' SG2UG 2-D compute domain') + call mpp_pass_UG_to_SG(UG_domain, x1(:,1), a2(:,:,1)) + + call compare_checksums(a1(:,:,1:1),a2(:,:,1:1),type//' UG2SG 2-D compute domain') + deallocate(a1,a2,x1,x2) + + !--- test the 3-D data is on computing domain + allocate( a1(isc:iec, jsc:jec,nz), a2(isc:iec,jsc:jec,nz ) ) + + tile = mpp_pe()/npes_per_tile + 1 + do k = 1, nz + do j = jsc, jec + do i = isc, iec + a1(i,j,k) = gdata(i,j,tile) + if(a1(i,j,k) .NE. 
-one ) a1(i,j,k) = real( a1(i,j,k) + k*1.e-6, kind=r4_kind ) + enddo + enddo + enddo + a2 = -one + + allocate(x1(istart:iend,nz), x2(istart:iend,nz)) + x1 = -one ; x2 = -one + !--- fill the value of x2 + tile = mpp_get_UG_domain_tile_id(UG_domain) + pos = 0 + do n = 1, tile-1 + pos = pos + npts_tile(n) + enddo + do l = istart, iend + i = mod((grid_index(pos+l)-1), nx) + 1 + j = (grid_index(pos+l)-1)/nx + 1 + do k = 1, nz + x2(l,k) = real( gdata(i,j,tile) + k*1.e-6, kind=r4_kind ) + enddo + enddo + + call mpp_pass_SG_to_UG(UG_domain, a1, x1) + call compare_checksums(x1, x2, type//' SG2UG 3-D data domain') + call mpp_pass_UG_to_SG(UG_domain, x1, a2) + + call compare_checksums(a1,a2,type//' UG2SG 3-D data domain') + deallocate(a1,a2,x1,x2) + + !---------------------------------------------------------------- + ! test mpp_global_field_ug + !---------------------------------------------------------------- + call mpp_get_UG_global_domain(UG_domain, lsg, leg) + tile = mpp_get_UG_domain_tile_id(UG_domain) + allocate(g1(lsg:leg,nz), g2(lsg:leg,nz), x1(istart:iend,nz)) + g1 = zero ; g2 =zero ; x1 = zero + do k = 1, nz + do l = lsg, leg + g1(l,k) = real( tile*1.e6 + l + k*1.e-3, kind=r4_kind ) + enddo + do l = istart, iend + x1(l,k) = g1(l,k) + enddo + enddo + + call mpp_global_field_ug(UG_domain, x1, g2) + call compare_checksums(g1,g2,type//' global_field_ug 3-D') + + g2 = zero + call mpp_global_field_ug(UG_domain, x1(:,1), g2(:,1)) + call compare_checksums(g1(:,1:1),g2(:,1:1),type//' global_field_ug 2-D') + + deallocate(g1,g2,x1) + + end subroutine mpp_global_field_ug_r4 + !> + !> mpp_global_field_ug_r8 BOTH 2D and 3D arrays + !> + subroutine mpp_global_field_ug_r8() + + implicit none + + real(kind=r8_kind) :: zero = 0.0, one=1.0 + real(kind=r8_kind),allocatable, dimension(:,:) :: x1, x2, g1, g2 + real(kind=r8_kind),allocatable, dimension(:,:,:) :: a1, a2, gdata + + integer :: ism, iem, jsm, jem, lsg, leg + + !--- set up data + allocate(gdata(nx,ny,ntiles)) ; gdata = -one + 
do n = 1, ntiles + do j = 1, ny + do i = 1, nx + if( lmask(i,j,n) ) gdata(i,j,n) = real( n*1.e+3 + i + j*1.e-3, kind=r8_kind ) + end do + end do + end do + + !--- test the 2-D data is on computing domain + allocate( a1(isc:iec, jsc:jec,1), a2(isc:iec,jsc:jec,1 ) ) + tile = mpp_pe()/npes_per_tile + 1 + do j = jsc, jec + do i = isc, iec + a1(i,j,1) = gdata(i,j,tile) + enddo + enddo + a2 = -one + + allocate(x1(istart:iend,1), x2(istart:iend,1)) + x1 = -one ; x2 = -one + !--- fill the value of x2 + tile = mpp_get_UG_domain_tile_id(UG_domain) + pos = 0 + do n = 1, tile-1 + pos = pos + npts_tile(n) + enddo + do l = istart, iend + i = mod((grid_index(pos+l)-1), nx) + 1 + j = (grid_index(pos+l)-1)/nx + 1 + x2(l,1) = gdata(i,j,tile) + enddo + + call mpp_pass_SG_to_UG(UG_domain, a1(:,:,1), x1(:,1)) + call compare_checksums(x1, x2, type//' SG2UG 2-D compute domain') + call mpp_pass_UG_to_SG(UG_domain, x1(:,1), a2(:,:,1)) + + call compare_checksums(a1(:,:,1:1),a2(:,:,1:1),type//' UG2SG 2-D compute domain') + deallocate(a1,a2,x1,x2) + + !--- test the 3-D data is on computing domain + allocate( a1(isc:iec, jsc:jec,nz), a2(isc:iec,jsc:jec,nz ) ) + + tile = mpp_pe()/npes_per_tile + 1 + do k = 1, nz + do j = jsc, jec + do i = isc, iec + a1(i,j,k) = gdata(i,j,tile) + if(a1(i,j,k) .NE. 
-one ) a1(i,j,k) = real( a1(i,j,k) + k*1.e-6, kind=r8_kind ) + enddo + enddo + enddo + a2 = -one + + allocate(x1(istart:iend,nz), x2(istart:iend,nz)) + x1 = -one ; x2 = -one + !--- fill the value of x2 + tile = mpp_get_UG_domain_tile_id(UG_domain) + pos = 0 + do n = 1, tile-1 + pos = pos + npts_tile(n) + enddo + do l = istart, iend + i = mod((grid_index(pos+l)-1), nx) + 1 + j = (grid_index(pos+l)-1)/nx + 1 + do k = 1, nz + x2(l,k) = real( gdata(i,j,tile) + k*1.e-6, kind=r8_kind ) + enddo + enddo + + call mpp_pass_SG_to_UG(UG_domain, a1, x1) + call compare_checksums(x1, x2, type//' SG2UG 3-D data domain') + call mpp_pass_UG_to_SG(UG_domain, x1, a2) + + call compare_checksums(a1,a2,type//' UG2SG 3-D data domain') + deallocate(a1,a2,x1,x2) + + !---------------------------------------------------------------- + ! test mpp_global_field_ug + !---------------------------------------------------------------- + call mpp_get_UG_global_domain(UG_domain, lsg, leg) + tile = mpp_get_UG_domain_tile_id(UG_domain) + allocate(g1(lsg:leg,nz), g2(lsg:leg,nz), x1(istart:iend,nz)) + g1 = zero ; g2 =zero ; x1 = zero + do k = 1, nz + do l = lsg, leg + g1(l,k) = real( tile*1.e6 + l + k*1.e-3, kind=r8_kind ) + enddo + do l = istart, iend + x1(l,k) = g1(l,k) + enddo + enddo + + call mpp_global_field_ug(UG_domain, x1, g2) + call compare_checksums(g1,g2,type//' global_field_ug 3-D') + + g2 = zero + call mpp_global_field_ug(UG_domain, x1(:,1), g2(:,1)) + call compare_checksums(g1(:,1:1),g2(:,1:1),type//' global_field_ug 2-D') + + deallocate(g1,g2,x1) + + end subroutine mpp_global_field_ug_r8 + !> + !> mpp_global_field_ug_i4 BOTH 2D and 3D arrays + !> + subroutine mpp_global_field_ug_i4() + + implicit none + + integer(kind=i4_kind) :: zero = 0, one=1 + integer(kind=i4_kind),allocatable, dimension(:,:) :: x1, x2, g1, g2 + integer(kind=i4_kind),allocatable, dimension(:,:,:) :: a1, a2, gdata + + integer :: ism, iem, jsm, jem, lsg, leg + + !--- set up data + allocate(gdata(nx,ny,ntiles)) ; gdata = 
-one + do n = 1, ntiles + do j = 1, ny + do i = 1, nx + if( lmask(i,j,n) ) gdata(i,j,n) = int( n*1e+6 + i*1e3 + j*1e2, kind=i4_kind ) + end do + end do + end do + + !--- test the 2-D data is on computing domain + allocate( a1(isc:iec, jsc:jec,1), a2(isc:iec,jsc:jec,1 ) ) + tile = mpp_pe()/npes_per_tile + 1 + do j = jsc, jec + do i = isc, iec + a1(i,j,1) = gdata(i,j,tile) + enddo + enddo + a2 = -one + + allocate(x1(istart:iend,1), x2(istart:iend,1)) + x1 = -one ; x2 = -one + !--- fill the value of x2 + tile = mpp_get_UG_domain_tile_id(UG_domain) + pos = 0 + do n = 1, tile-1 + pos = pos + npts_tile(n) + enddo + do l = istart, iend + i = mod((grid_index(pos+l)-1), nx) + 1 + j = (grid_index(pos+l)-1)/nx + 1 + x2(l,1) = gdata(i,j,tile) + enddo + + call mpp_pass_SG_to_UG(UG_domain, a1(:,:,1), x1(:,1)) + call compare_checksums_int(x1, x2, type//' SG2UG 2-D compute domain') + call mpp_pass_UG_to_SG(UG_domain, x1(:,1), a2(:,:,1)) + + call compare_checksums_int(a1(:,:,1:1),a2(:,:,1:1),type//' UG2SG 2-D compute domain') + deallocate(a1,a2,x1,x2) + + !--- test the 3-D data is on computing domain + allocate( a1(isc:iec, jsc:jec,nz), a2(isc:iec,jsc:jec,nz ) ) + + tile = mpp_pe()/npes_per_tile + 1 + do k = 1, nz + do j = jsc, jec + do i = isc, iec + a1(i,j,k) = gdata(i,j,tile) + if(a1(i,j,k) .NE. 
-one ) a1(i,j,k) = int( a1(i,j,k) + k*1e6, kind=i4_kind ) + enddo + enddo + enddo + a2 = -one + + allocate(x1(istart:iend,nz), x2(istart:iend,nz)) + x1 = -one ; x2 = -one + !--- fill the value of x2 + tile = mpp_get_UG_domain_tile_id(UG_domain) + pos = 0 + do n = 1, tile-1 + pos = pos + npts_tile(n) + enddo + do l = istart, iend + i = mod((grid_index(pos+l)-1), nx) + 1 + j = (grid_index(pos+l)-1)/nx + 1 + do k = 1, nz + x2(l,k) = int( gdata(i,j,tile) + k*1e6, kind=i4_kind ) + enddo + enddo + + call mpp_pass_SG_to_UG(UG_domain, a1, x1) + call compare_checksums_int(x1, x2, type//' SG2UG 3-D data domain') + call mpp_pass_UG_to_SG(UG_domain, x1, a2) + + call compare_checksums_int(a1,a2,type//' UG2SG 3-D data domain') + deallocate(a1,a2,x1,x2) + + !---------------------------------------------------------------- + ! test mpp_global_field_ug + !---------------------------------------------------------------- + call mpp_get_UG_global_domain(UG_domain, lsg, leg) + tile = mpp_get_UG_domain_tile_id(UG_domain) + allocate(g1(lsg:leg,nz), g2(lsg:leg,nz), x1(istart:iend,nz)) + g1 = zero ; g2 =zero ; x1 = zero + do k = 1, nz + do l = lsg, leg + g1(l,k) = int( n*1e+6 + i*1e3 + j*1e2, kind=i4_kind ) + enddo + do l = istart, iend + x1(l,k) = g1(l,k) + enddo + enddo + + call mpp_global_field_ug(UG_domain, x1, g2) + call compare_checksums_int(g1,g2,type//' global_field_ug 3-D') + + g2 = zero + call mpp_global_field_ug(UG_domain, x1(:,1), g2(:,1)) + call compare_checksums_int(g1(:,1:1),g2(:,1:1),type//' global_field_ug 2-D') + + deallocate(g1,g2,x1) + + end subroutine mpp_global_field_ug_i4 + !> + !> mpp_global_field_ug_i8 BOTH 2D and 3D arrays + !> + subroutine mpp_global_field_ug_i8() + + implicit none + + integer(kind=i8_kind) :: zero = 0, one=1 + integer(kind=i8_kind),allocatable, dimension(:,:) :: x1, x2, g1, g2 + integer(kind=i8_kind),allocatable, dimension(:,:,:) :: a1, a2, gdata + + integer :: ism, iem, jsm, jem, lsg, leg + + !> interface for mpp_pass_SG_to_UG for i8 does not 
exist + + call mpp_get_UG_global_domain(UG_domain, lsg, leg) + tile = mpp_get_UG_domain_tile_id(UG_domain) + allocate(g1(lsg:leg,nz), g2(lsg:leg,nz), x1(istart:iend,nz)) + g1 = zero ; g2 =zero ; x1 = zero + do k = 1, nz + do l = lsg, leg + g1(l,k) = int( n*1e+6 + i*1e3 + j*1e2, kind=i8_kind ) + enddo + do l = istart, iend + x1(l,k) = g1(l,k) + enddo + enddo + + call mpp_global_field_ug(UG_domain, x1, g2) + call compare_checksums_int(g1,g2,type//' global_field_ug 3-D') + + g2 = zero + call mpp_global_field_ug(UG_domain, x1(:,1), g2(:,1)) + call compare_checksums_int(g1(:,1:1),g2(:,1:1),type//' global_field_ug 2-D') + + deallocate(g1,g2,x1) + + end subroutine mpp_global_field_ug_i8 + !> + !> setup_domains + !> + subroutine setup_domains() + + implicit none + + integer :: num_contact, shift, ntotal_land + + integer, allocatable, dimension(:) :: isl, iel, jsl, jel + integer, allocatable, dimension(:) :: pe_start, pe_end + integer, allocatable, dimension(:,:) :: layout2D, global_indices + real, allocatable, dimension(:,:) :: rmask + real, allocatable, dimension(:) :: frac_crit + + + !--- check the type + select case(type) + case ( 'Cubic-Grid' ) + if( nx_cubic == 0 ) & + call mpp_error(FATAL,'test_unstruct_update: for Cubic_grid mosaic, nx_cubic is zero, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + if( nx_cubic .NE. ny_cubic ) & + call mpp_error(FATAL,'test_unstruct_update: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + nx = nx_cubic + ny = ny_cubic + ntiles = 6 + num_contact = 12 + cubic_grid = .true. + if( mod(npes, ntiles) == 0 ) then + npes_per_tile = npes/ntiles + write(*,*)'NOTE from test_unstruct_update ==> For Mosaic "', trim(type), & + '", each tile will be distributed over ', npes_per_tile, ' processors.' 
+ else + call mpp_error(FATAL,'test_unstruct_update: npes should be multiple of ntiles No test is done for '//trim(type)) + endif + if(layout_cubic(1)*layout_cubic(2) == npes_per_tile) then + layout = layout_cubic + else + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + endif + allocate(frac_crit(ntiles)) + frac_crit(1) = 0.3; frac_crit(2) = 0.1; frac_crit(3) = 0.6 + frac_crit(4) = 0.2; frac_crit(5) = 0.4; frac_crit(6) = 0.5 + case default + call mpp_error(FATAL, 'test_group_update: no such test: '//type) + end select + + allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) + do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + + do n = 1, ntiles + global_indices(:,n) = (/1,nx,1,ny/) + layout2D(:,n) = layout + end do + + !--- define domain + if( cubic_grid ) call define_cubic_mosaic(type, SG_domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & + global_indices, layout2D, pe_start, pe_end ) + + !--- setup data + call mpp_get_compute_domain( SG_domain, isc, iec, jsc, jec ) + call mpp_get_data_domain ( SG_domain, isd, ied, jsd, jed ) + + allocate( lmask(nx,ny,ntiles), npts_tile(ntiles) ) + lmask = .false. + if(mpp_pe() == mpp_root_pe() ) then + allocate( rmask(nx,ny) ) + !--- construct gmask. + do n = 1, ntiles + call random_number(rmask) + do j = 1, ny + do i = 1, nx + if(rmask(i,j) > frac_crit(n)) lmask(i,j,n) = .true. 
+ enddo + enddo + npts_tile(n) = count(lmask(:,:,n)) + enddo + + ntotal_land = sum(npts_tile) + allocate(grid_index(ntotal_land)) + allocate(isl(0:mpp_npes()-1), iel(0:mpp_npes()-1)) + allocate(jsl(0:mpp_npes()-1), jel(0:mpp_npes()-1)) + call mpp_get_compute_domains(SG_domain,xbegin=isl,xend=iel,ybegin=jsl,yend=jel) + + l = 0 + do n = 1, ntiles + do j = 1, ny + do i = 1, nx + if(lmask(i,j,n)) then + l = l + 1 + grid_index(l) = (j-1)*nx+i + endif + enddo + enddo + enddo + + deallocate(rmask, isl, iel, jsl, jel) + + end if + + call mpp_broadcast(npts_tile, ntiles, mpp_root_pe()) + if(mpp_pe() .NE. mpp_root_pe()) then + ntotal_land = sum(npts_tile) + allocate(grid_index(ntotal_land)) + endif + call mpp_broadcast(grid_index, ntotal_land, mpp_root_pe()) + + allocate(ntiles_grid(ntotal_land)) + ntiles_grid = 1 + !--- define the unstructured grid domain + call mpp_define_unstruct_domain(UG_domain, SG_domain, npts_tile, ntiles_grid, mpp_npes(), 1, grid_index, name="LAND unstruct") + call mpp_get_UG_compute_domain(UG_domain, istart, iend) + + !--- figure out lmask according to grid_index + pos = 0 + do n = 1, ntiles + do l = 1, npts_tile(n) + pos = pos + 1 + j = (grid_index(pos)-1)/nx + 1 + i = mod((grid_index(pos)-1),nx) + 1 + lmask(i,j,n) = .true. + enddo + enddo + + end subroutine setup_domains + !> + !> define_cubic_mosaic + !> + subroutine define_cubic_mosaic(type, domain, ni, nj, global_indices, layout, pe_start, pe_end, use_memsize) + + implicit none + + character(len=*), intent(in) :: type + type(domain2d), intent(inout) :: domain + integer, intent(in) :: global_indices(:,:), layout(:,:) + integer, intent(in) :: ni(:), nj(:) + integer, intent(in) :: pe_start(:), pe_end(:) + logical, optional, intent(in) :: use_memsize + integer, dimension(12) :: istart1, iend1, jstart1, jend1, tile1 + integer, dimension(12) :: istart2, iend2, jstart2, jend2, tile2 + integer :: ntiles, num_contact, msize(2) + logical :: use_memsize_local + + use_memsize_local = .true. 
+ if(present(use_memsize)) use_memsize_local = use_memsize + + ntiles = 6 + num_contact = 12 + if(size(pe_start(:)) .NE. 6 .OR. size(pe_end(:)) .NE. 6 ) call mpp_error(FATAL, & + "define_cubic_mosaic: size of pe_start and pe_end should be 6") + if(size(global_indices,1) .NE. 4) call mpp_error(FATAL, & + "define_cubic_mosaic: size of first dimension of global_indices should be 4") + if(size(global_indices,2) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of second dimension of global_indices should be 6") + if(size(layout,1) .NE. 2) call mpp_error(FATAL, & + "define_cubic_mosaic: size of first dimension of layout should be 2") + if(size(layout,2) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of second dimension of layout should be 6") + if(size(ni(:)) .NE. 6 .OR. size(nj(:)) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of ni and nj should be 6") + + !--- Contact line 1, between tile 1 (EAST) and tile 2 (WEST) + tile1(1) = 1; tile2(1) = 2 + istart1(1) = ni(1); iend1(1) = ni(1); jstart1(1) = 1; jend1(1) = nj(1) + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = nj(2) + !--- Contact line 2, between tile 1 (NORTH) and tile 3 (WEST) + tile1(2) = 1; tile2(2) = 3 + istart1(2) = 1; iend1(2) = ni(1); jstart1(2) = nj(1); jend1(2) = nj(1) + istart2(2) = 1; iend2(2) = 1; jstart2(2) = nj(3); jend2(2) = 1 + !--- Contact line 3, between tile 1 (WEST) and tile 5 (NORTH) + tile1(3) = 1; tile2(3) = 5 + istart1(3) = 1; iend1(3) = 1; jstart1(3) = 1; jend1(3) = nj(1) + istart2(3) = ni(5); iend2(3) = 1; jstart2(3) = nj(5); jend2(3) = nj(5) + !--- Contact line 4, between tile 1 (SOUTH) and tile 6 (NORTH) + tile1(4) = 1; tile2(4) = 6 + istart1(4) = 1; iend1(4) = ni(1); jstart1(4) = 1; jend1(4) = 1 + istart2(4) = 1; iend2(4) = ni(6); jstart2(4) = nj(6); jend2(4) = nj(6) + !--- Contact line 5, between tile 2 (NORTH) and tile 3 (SOUTH) + tile1(5) = 2; tile2(5) = 3 + istart1(5) = 1; iend1(5) = ni(2); jstart1(5) = nj(2); jend1(5) = nj(2) + 
istart2(5) = 1; iend2(5) = ni(3); jstart2(5) = 1; jend2(5) = 1 + !--- Contact line 6, between tile 2 (EAST) and tile 4 (SOUTH) + tile1(6) = 2; tile2(6) = 4 + istart1(6) = ni(2); iend1(6) = ni(2); jstart1(6) = 1; jend1(6) = nj(2) + istart2(6) = ni(4); iend2(6) = 1; jstart2(6) = 1; jend2(6) = 1 + !--- Contact line 7, between tile 2 (SOUTH) and tile 6 (EAST) + tile1(7) = 2; tile2(7) = 6 + istart1(7) = 1; iend1(7) = ni(2); jstart1(7) = 1; jend1(7) = 1 + istart2(7) = ni(6); iend2(7) = ni(6); jstart2(7) = nj(6); jend2(7) = 1 + !--- Contact line 8, between tile 3 (EAST) and tile 4 (WEST) + tile1(8) = 3; tile2(8) = 4 + istart1(8) = ni(3); iend1(8) = ni(3); jstart1(8) = 1; jend1(8) = nj(3) + istart2(8) = 1; iend2(8) = 1; jstart2(8) = 1; jend2(8) = nj(4) + !--- Contact line 9, between tile 3 (NORTH) and tile 5 (WEST) + tile1(9) = 3; tile2(9) = 5 + istart1(9) = 1; iend1(9) = ni(3); jstart1(9) = nj(3); jend1(9) = nj(3) + istart2(9) = 1; iend2(9) = 1; jstart2(9) = nj(5); jend2(9) = 1 + !--- Contact line 10, between tile 4 (NORTH) and tile 5 (SOUTH) + tile1(10) = 4; tile2(10) = 5 + istart1(10) = 1; iend1(10) = ni(4); jstart1(10) = nj(4); jend1(10) = nj(4) + istart2(10) = 1; iend2(10) = ni(5); jstart2(10) = 1; jend2(10) = 1 + !--- Contact line 11, between tile 4 (EAST) and tile 6 (SOUTH) + tile1(11) = 4; tile2(11) = 6 + istart1(11) = ni(4); iend1(11) = ni(4); jstart1(11) = 1; jend1(11) = nj(4) + istart2(11) = ni(6); iend2(11) = 1; jstart2(11) = 1; jend2(11) = 1 + !--- Contact line 12, between tile 5 (EAST) and tile 6 (WEST) + tile1(12) = 5; tile2(12) = 6 + istart1(12) = ni(5); iend1(12) = ni(5); jstart1(12) = 1; jend1(12) = nj(5) + istart2(12) = 1; iend2(12) = 1; jstart2(12) = 1; jend2(12) = nj(6) + msize(1) = maxval(ni(:)/layout(1,:)) + whalo + ehalo + 1 ! make sure memory domain size is no smaller than + msize(2) = maxval(nj(:)/layout(2,:)) + shalo + nhalo + 1 ! 
data domain size + + if(use_memsize_local) then + call mpp_define_mosaic(global_indices, layout, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, symmetry = .true., whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name = trim(type), memory_size = msize ) + else + call mpp_define_mosaic(global_indices, layout, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, symmetry = .true., whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name = trim(type) ) + endif + + return + + end subroutine define_cubic_mosaic + +end program test_mpp_global_field_ug diff --git a/test_fms/mpp/test_mpp_global_field_ug.sh b/test_fms/mpp/test_mpp_global_field_ug.sh new file mode 100755 index 0000000000..d0f961ad61 --- /dev/null +++ b/test_fms/mpp/test_mpp_global_field_ug.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. 
+ +# Uriel Ramirez 07/15/2020 + +# Set common test settings. +. ../test_common.sh + +touch input.nml +run_test test_mpp_global_field_ug 6 diff --git a/test_fms/mpp/test_mpp_global_sum_ad.F90 b/test_fms/mpp/test_mpp_global_sum_ad.F90 new file mode 100644 index 0000000000..696d732d40 --- /dev/null +++ b/test_fms/mpp/test_mpp_global_sum_ad.F90 @@ -0,0 +1,1042 @@ +!*********************************************************************** +!* Gnu Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!************************************************************ +!> @author Jessica Liptak +!> @email gfdl.climate.model.info@noaa.gov +!> @description Test the mpp_global_sum_ad interfaces with 32-bit and 64-bit +!! real and integer arrays. mpp_global_sum_ad computes the global adjoint sum of +!! a field. 
+program test_mpp_global_sum_ad + + use mpp_mod, only : FATAL, MPP_DEBUG + use mpp_mod, only : mpp_pe, mpp_npes, mpp_root_pe, mpp_error, mpp_sum + use mpp_mod, only : mpp_init, stdout, stderr + use mpp_mod, only : mpp_get_current_pelist, mpp_broadcast + use mpp_mod, only : mpp_init_test_requests_allocated + use mpp_domains_mod, only : BITWISE_EXACT_SUM + use mpp_domains_mod, only : CYCLIC_GLOBAL_DOMAIN + use mpp_domains_mod, only : domain2D + use mpp_domains_mod, only : mpp_get_data_domain, mpp_domains_set_stack_size + use mpp_domains_mod, only : mpp_global_sum + use mpp_domains_mod, only : mpp_domains_init, mpp_domains_exit, mpp_broadcast_domain + use mpp_domains_mod, only : mpp_update_domains, mpp_check_field + use mpp_domains_mod, only : mpp_define_layout, mpp_define_domains + use mpp_domains_mod, only : NORTH, EAST, CORNER, CENTER + use mpp_domains_mod, only : mpp_global_sum_ad + use mpp_io_mod, only : mpp_io_init + use platform_mod + + + implicit none + integer :: pe, npes + integer :: nx=128, ny=128, nz=2, stackmax=4000000 + integer :: layout(2) + integer :: ierr + integer :: whalo = 2, ehalo = 2, shalo = 2, nhalo = 2 + + call mpp_init(test_level=mpp_init_test_requests_allocated) + call mpp_domains_init(MPP_DEBUG) + call mpp_io_init() + call mpp_domains_set_stack_size(stackmax) + + pe = mpp_pe() + npes = mpp_npes() + + call test_global_sum_ad_r4( 'Simple') + call test_global_sum_ad_r4( 'Cyclic symmetry center') + + call test_global_sum_ad_r8( 'Simple') + call test_global_sum_ad_r8( 'Cyclic symmetry center') + + call test_global_sum_ad_i4( 'Simple') + call test_global_sum_ad_i4( 'Cyclic symmetry center') + + call test_global_sum_ad_i8( 'Simple') + call test_global_sum_ad_i8( 'Cyclic symmetry center') + + + call MPI_finalize(ierr) + +contains + + !> test the 32-bit real global_sum_ad interfaces + subroutine test_global_sum_ad_r4 (domain_type) + character(len=*), intent(in) :: domain_type ! type of mpp domain to use + ! 
local + real(r4_kind) :: gsum_tl, gsum_ad + real(r4_kind) :: gsum_tl_save, gsum_ad_save + real(r4_kind) :: gsum_tl_bit, gsum_ad_bit + real(r4_kind) :: gsum_tl_save_bit, gsum_ad_save_bit + integer :: i,j,k, ishift, jshift, position + integer :: isd, ied, jsd, jed + + type(domain2D) :: domain + real(r4_kind), allocatable, dimension(:,:) :: x2, x2_ad, x2_ad_bit + real(r4_kind), allocatable, dimension(:,:,:) :: x3, x3_ad, x3_ad_bit + real(r4_kind), allocatable, dimension(:,:,:,:) :: x4, x4_ad, x4_ad_bit + real(r4_kind), allocatable, dimension(:,:,:,:,:) :: x5, x5_ad, x5_ad_bit + + call generate_domain(domain, domain_type) + + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + + position = CENTER + + ! test the 2D arrays + allocate( x2(isd:ied,jsd:jed), x2_ad(isd:ied,jsd:jed), x2_ad_bit(isd:ied,jsd:jed) ) + + x2=0.0 + do j = jsd, jed + do i = isd, ied + x2(i,j) = real(i+j, kind=r4_kind) + enddo + enddo + + gsum_ad = 0.0 + gsum_tl = 0.0 + gsum_tl_save = 0.0 + gsum_tl_bit = 0.0 + gsum_tl_save_bit = 0.0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x2, position = position ) + gsum_tl_bit = mpp_global_sum( domain, x2, flags=BITWISE_EXACT_SUM ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit + + gsum_ad = gsum_tl + gsum_ad_bit = gsum_tl_bit + + x2_ad = 0. + x2_ad_bit = 0. + ! adjoint sum of global field + call mpp_global_sum_ad( domain, x2_ad, gsum_ad, position = position ) + call mpp_global_sum_ad( domain, x2_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) + + gsum_ad_save = 0. + gsum_ad_save_bit = 0. + ! sum the original global sum and the adjoint global sum + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x2_ad(i,j)*x2(i,j) + gsum_ad_save_bit = gsum_ad_save_bit + x2_ad_bit(i,j)*x2(i,j) + enddo + enddo + ! 
sum across the pes + call mpp_sum( gsum_ad_save ) + call mpp_sum( gsum_ad_save_bit ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (abs((gsum_ad_save-gsum_tl_save)/gsum_tl_save).lt.1E-7) then + print*, "2D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r4" + endif + if (abs((gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit).lt.1E-7) then + print*, "2D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r4,"//& + "flags=BITWISE_EXACT_SUM" + endif + endif + deallocate(x2, x2_ad, x2_ad_bit) + + ! test 3D arrays + allocate( x3(isd:ied,jsd:jed,nz), x3_ad(isd:ied,jsd:jed,nz), x3_ad_bit(isd:ied,jsd:jed,nz) ) + + x3=0.0 + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x3(i,j,k) = real(i+j+k, kind=r4_kind) + enddo + enddo + enddo + + gsum_ad = 0.0 + gsum_tl = 0.0 + gsum_tl_save = 0.0 + gsum_tl_bit = 0.0 + gsum_tl_save_bit = 0.0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x3, position = position ) + gsum_tl_bit = mpp_global_sum( domain, x3, flags=BITWISE_EXACT_SUM ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit + + gsum_ad = gsum_tl + gsum_ad_bit = gsum_tl_bit + + x3_ad = 0. + x3_ad_bit = 0. + ! adjoint sum of global field + call mpp_global_sum_ad( domain, x3_ad, gsum_ad, position = position ) + call mpp_global_sum_ad( domain, x3_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) + + gsum_ad_save = 0. + gsum_ad_save_bit = 0. + ! sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x3_ad(i,j,k)*x3(i,j,k) + gsum_ad_save_bit = gsum_ad_save_bit + x3_ad_bit(i,j,k)*x3(i,j,k) + enddo + enddo + enddo + ! 
sum across the pes + call mpp_sum( gsum_ad_save ) + call mpp_sum( gsum_ad_save_bit ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (abs((gsum_ad_save-gsum_tl_save)/gsum_tl_save).lt.1E-7) then + print*, "3D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r4" + endif + if (abs((gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit).lt.1E-7) then + print*, "3D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r4,"//& + "flags=BITWISE_EXACT_SUM" + endif + endif + + deallocate(x3, x3_ad, x3_ad_bit) + + ! test 4D arrays + allocate( x4(isd:ied,jsd:jed,nz,1), x4_ad(isd:ied,jsd:jed,nz,1), x4_ad_bit(isd:ied,jsd:jed,nz,1) ) + + x4=0.0 + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x4(i,j,k,1) = real(i+j+k, kind=r4_kind) + enddo + enddo + enddo + + gsum_ad = 0.0 + gsum_tl = 0.0 + gsum_tl_save = 0.0 + gsum_tl_bit = 0.0 + gsum_tl_save_bit = 0.0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x4, position = position ) + gsum_tl_bit = mpp_global_sum( domain, x4, flags=BITWISE_EXACT_SUM ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit + + gsum_ad = gsum_tl + gsum_ad_bit = gsum_tl_bit + + x4_ad = 0. + x4_ad_bit = 0. + ! adjoint sum of global field + call mpp_global_sum_ad( domain, x4_ad, gsum_ad, position = position ) + call mpp_global_sum_ad( domain, x4_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) + + gsum_ad_save = 0. + gsum_ad_save_bit = 0. + ! sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x4_ad(i,j,k,1)*x4(i,j,k,1) + gsum_ad_save_bit = gsum_ad_save_bit + x4_ad_bit(i,j,k,1)*x4(i,j,k,1) + enddo + enddo + enddo + ! 
sum across the pes + call mpp_sum( gsum_ad_save ) + call mpp_sum( gsum_ad_save_bit ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (abs((gsum_ad_save-gsum_tl_save)/gsum_tl_save).lt.1E-7) then + print*, "4d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r4" + endif + if (abs((gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit).lt.1E-7) then + print*, "4d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r4,"//& + "flags=BITWISE_EXACT_SUM" + endif + endif + + deallocate(x4, x4_ad, x4_ad_bit) + + ! test 5D arrays + allocate( x5(isd:ied,jsd:jed,nz,1,1), x5_ad(isd:ied,jsd:jed,nz,1,1), & + x5_ad_bit(isd:ied,jsd:jed,nz,1,1) ) + + x5=0.0 + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x5(i,j,k,1,1) = real(i+j+k, kind=r4_kind) + enddo + enddo + enddo + + gsum_ad = 0.0 + gsum_tl = 0.0 + gsum_tl_save = 0.0 + gsum_tl_bit = 0.0 + gsum_tl_save_bit = 0.0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x5, position = position ) + gsum_tl_bit = mpp_global_sum( domain, x5, flags=BITWISE_EXACT_SUM ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit + gsum_ad = gsum_tl + gsum_ad_bit = gsum_tl_bit + + x5_ad = 0. + x5_ad_bit = 0. + ! adjoint sum of the global field + call mpp_global_sum_ad( domain, x5_ad, gsum_ad, position = position ) + call mpp_global_sum_ad( domain, x5_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) + + gsum_ad_save = 0. + gsum_ad_save_bit = 0. + ! sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x5_ad(i,j,k,1,1)*x5(i,j,k,1,1) + gsum_ad_save_bit = gsum_ad_save_bit + x5_ad_bit(i,j,k,1,1)*x5(i,j,k,1,1) + enddo + enddo + enddo + ! 
sum across the pes + call mpp_sum( gsum_ad_save ) + call mpp_sum( gsum_ad_save_bit ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (abs((gsum_ad_save-gsum_tl_save)/gsum_tl_save).lt.1E-7) then + print*, "5d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r4" + endif + if (abs((gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit).lt.1E-7) then + print*, "5d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r4,"//& + "flags=BITWISE_EXACT_SUM" + endif + endif + + deallocate(x5, x5_ad, x5_ad_bit) + + end subroutine test_global_sum_ad_r4 + + !> test 64-bit real global_sum_ad interfaces + subroutine test_global_sum_ad_r8 (domain_type) + character(len=*), intent(in) :: domain_type ! type of mpp domain to use + ! local + type(domain2D) :: domain + real(r8_kind) :: gsum_tl, gsum_ad + real(r8_kind) :: gsum_tl_save, gsum_ad_save + real(r8_kind) :: gsum_tl_bit, gsum_ad_bit + real(r8_kind) :: gsum_tl_save_bit, gsum_ad_save_bit + integer :: i,j,k, ishift, jshift, position + integer :: isd, ied, jsd, jed + + real(r8_kind), allocatable, dimension(:,:) :: x2, x2_ad, x2_ad_bit + real(r8_kind), allocatable, dimension(:,:,:) :: x3, x3_ad, x3_ad_bit + real(r8_kind), allocatable, dimension(:,:,:,:) :: x4, x4_ad, x4_ad_bit + real(r8_kind), allocatable, dimension(:,:,:,:,:) :: x5, x5_ad, x5_ad_bit + + call generate_domain(domain, domain_type) + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + position = CENTER + + ! test the 2D arrays + allocate( x2(isd:ied,jsd:jed), x2_ad(isd:ied,jsd:jed), x2_ad_bit(isd:ied,jsd:jed) ) + + x2=0.0 + do j = jsd, jed + do i = isd, ied + x2(i,j) = real(i+j, kind=r8_kind) + enddo + enddo + + gsum_ad = 0.0 + gsum_tl = 0.0 + gsum_tl_save = 0.0 + gsum_tl_bit = 0.0 + gsum_tl_save_bit = 0.0 + + ! 
global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x2, position = position ) + gsum_tl_bit = mpp_global_sum( domain, x2, flags=BITWISE_EXACT_SUM ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit + + gsum_ad = gsum_tl + gsum_ad_bit = gsum_tl_bit + + x2_ad = 0. + x2_ad_bit = 0. + ! adjoint sum of the global field + call mpp_global_sum_ad( domain, x2_ad, gsum_ad, position = position ) + call mpp_global_sum_ad( domain, x2_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) + + gsum_ad_save = 0. + gsum_ad_save_bit = 0. + ! sum the the original global sum and the adjoint global sum + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x2_ad(i,j)*x2(i,j) + gsum_ad_save_bit = gsum_ad_save_bit + x2_ad_bit(i,j)*x2(i,j) + enddo + enddo + + call mpp_sum( gsum_ad_save ) + call mpp_sum( gsum_ad_save_bit ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (abs((gsum_ad_save-gsum_tl_save)/gsum_tl_save).lt.1E-7) then + print*, "2D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r8" + endif + if (abs((gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit).lt.1E-7) then + print*, "2D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r8,"//& + "flags=BITWISE_EXACT_SUM" + endif + endif + deallocate(x2, x2_ad, x2_ad_bit) + + ! test 3D arrays + allocate( x3(isd:ied,jsd:jed,nz), x3_ad(isd:ied,jsd:jed,nz), x3_ad_bit(isd:ied,jsd:jed,nz) ) + + x3=0. + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x3(i,j,k) = real(i+j+k, kind=r8_kind) + enddo + enddo + enddo + + gsum_ad = 0.0 + gsum_tl = 0.0 + gsum_tl_save = 0.0 + gsum_tl_bit = 0.0 + gsum_tl_save_bit = 0.0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x3, position = position ) + gsum_tl_bit = mpp_global_sum( domain, x3, flags=BITWISE_EXACT_SUM ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit + + gsum_ad = gsum_tl + gsum_ad_bit = gsum_tl_bit + + x3_ad = 0. + x3_ad_bit = 0. + ! 
adjoint sum of the global field + call mpp_global_sum_ad( domain, x3_ad, gsum_ad, position = position ) + call mpp_global_sum_ad( domain, x3_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) + + gsum_ad_save = 0. + gsum_ad_save_bit = 0. + ! sum of the global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x3_ad(i,j,k)*x3(i,j,k) + gsum_ad_save_bit = gsum_ad_save_bit + x3_ad_bit(i,j,k)*x3(i,j,k) + enddo + enddo + enddo + ! sum across all pes + call mpp_sum( gsum_ad_save ) + call mpp_sum( gsum_ad_save_bit ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (abs((gsum_ad_save-gsum_tl_save)/gsum_tl_save).lt.1E-7) then + print*, "3D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r8" + endif + if (abs((gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit).lt.1E-7) then + print*, "3D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r8,"//& + "flags=BITWISE_EXACT_SUM" + endif + endif + + deallocate(x3, x3_ad, x3_ad_bit) + + ! test 4D arrays + allocate( x4(isd:ied,jsd:jed,nz,1), x4_ad(isd:ied,jsd:jed,nz,1), x4_ad_bit(isd:ied,jsd:jed,nz,1) ) + + x4=0. + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x4(i,j,k,1) = real(i+j+k, kind=r8_kind) + enddo + enddo + enddo + + gsum_ad = 0.0 + gsum_tl = 0.0 + gsum_tl_save = 0.0 + gsum_tl_bit = 0.0 + gsum_tl_save_bit = 0.0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x4, position = position ) + gsum_tl_bit = mpp_global_sum( domain, x4, flags=BITWISE_EXACT_SUM ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit + + gsum_ad = gsum_tl + gsum_ad_bit = gsum_tl_bit + + x4_ad = 0. + x4_ad_bit = 0. + ! ajoint sum of the global field + call mpp_global_sum_ad( domain, x4_ad, gsum_ad, position = position ) + call mpp_global_sum_ad( domain, x4_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) + + gsum_ad_save = 0. + gsum_ad_save_bit = 0. + ! 
sum of the adjoint global sum and the original global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x4_ad(i,j,k,1)*x4(i,j,k,1) + gsum_ad_save_bit = gsum_ad_save_bit + x4_ad_bit(i,j,k,1)*x4(i,j,k,1) + enddo + enddo + enddo + ! sum across all pes + call mpp_sum( gsum_ad_save ) + call mpp_sum( gsum_ad_save_bit ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (abs((gsum_ad_save-gsum_tl_save)/gsum_tl_save).lt.1E-7) then + print*, "4d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r8" + endif + if (abs((gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit).lt.1E-7) then + print*, "4d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r8,"//& + "flags=BITWISE_EXACT_SUM" + endif + endif + + deallocate(x4, x4_ad) + + ! test 5D arrays + allocate( x5(isd:ied,jsd:jed,nz,1,1), x5_ad(isd:ied,jsd:jed,nz,1,1), & + x5_ad_bit(isd:ied,jsd:jed,nz,1,1) ) + + x5=0. + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x5(i,j,k,1,1) = real(i+j+k, kind=r8_kind) + enddo + enddo + enddo + + gsum_ad = 0.0 + gsum_tl = 0.0 + gsum_tl_save = 0.0 + gsum_tl_bit = 0.0 + gsum_tl_save_bit = 0.0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x5, position = position ) + gsum_tl_bit = mpp_global_sum( domain, x5, flags=BITWISE_EXACT_SUM ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_tl_save_bit = gsum_tl_bit*gsum_tl_bit + + gsum_ad = gsum_tl + gsum_ad_bit = gsum_tl_bit + + x5_ad = 0. + x5_ad_bit = 0. + ! global adjoint sum + call mpp_global_sum_ad( domain, x5_ad, gsum_ad, position = position ) + call mpp_global_sum_ad( domain, x5_ad_bit, gsum_ad_bit, flags = BITWISE_EXACT_SUM ) + + gsum_ad_save = 0. + gsum_ad_save_bit = 0. + ! sum of the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x5_ad(i,j,k,1,1)*x5(i,j,k,1,1) + gsum_ad_save_bit = gsum_ad_save_bit + x5_ad_bit(i,j,k,1,1)*x5(i,j,k,1,1) + enddo + enddo + enddo + ! 
sum across all pes + call mpp_sum( gsum_ad_save ) + call mpp_sum( gsum_ad_save_bit ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (abs((gsum_ad_save-gsum_tl_save)/gsum_tl_save).lt.1E-7) then + print*, "5d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r8" + endif + if (abs((gsum_ad_save_bit-gsum_tl_save_bit)/gsum_tl_save_bit).lt.1E-7) then + print*, "5d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_r8,"//& + "flags=BITWISE_EXACT_SUM" + endif + endif + + deallocate(x5, x5_ad, x5_ad_bit) + + end subroutine test_global_sum_ad_r8 + + !> test the 32-bit integer global_sum_ad interfaces + subroutine test_global_sum_ad_i4 (domain_type) + character(len=*), intent(in) :: domain_type ! type of mpp domain to use + ! local + integer(i4_kind) :: gsum_tl, gsum_ad + integer(i4_kind) :: gsum_tl_save, gsum_ad_save + integer :: i,j,k, ishift, jshift, position + integer :: isd, ied, jsd, jed + + type(domain2D) :: domain + integer(i4_kind), allocatable, dimension(:,:) :: x2, x2_ad + integer(i4_kind), allocatable, dimension(:,:,:) :: x3, x3_ad + integer(i4_kind), allocatable, dimension(:,:,:,:) :: x4, x4_ad + integer(i4_kind), allocatable, dimension(:,:,:,:,:) :: x5, x5_ad + + call generate_domain(domain, domain_type) + + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + + position = CENTER + + ! test the 2D arrays + allocate( x2(isd:ied,jsd:jed), x2_ad(isd:ied,jsd:jed)) + + x2=0 + do j = jsd, jed + do i = isd, ied + x2(i,j) = int(i+j, kind=i4_kind) + enddo + enddo + + gsum_ad = 0 + gsum_tl = 0 + gsum_tl_save = 0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x2, position = position ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_ad = gsum_tl + + x2_ad = 0 + + ! adjoint sum of global field + call mpp_global_sum_ad( domain, x2_ad, gsum_ad, position = position ) + + gsum_ad_save = 0 + ! 
sum the original global sum and the adjoint global sum + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x2_ad(i,j)*x2(i,j) + enddo + enddo + ! sum across the pes + call mpp_sum( gsum_ad_save ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (gsum_ad_save .eq. gsum_tl_save) then + print*, "2D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_i4" + endif + endif + deallocate(x2, x2_ad) + + ! test 3D arrays + allocate( x3(isd:ied,jsd:jed,nz), x3_ad(isd:ied,jsd:jed,nz)) + + x3 = 0 + do k = 1,nz + do j = jsd,jed + do i = isd, ied + x3(i,j,k) = int(i+j+k, kind=i4_kind) + enddo + enddo + enddo + + gsum_ad = 0 + gsum_tl = 0 + gsum_tl_save = 0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x3, position = position ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_ad = gsum_tl + + x3_ad = 0 + ! adjoint sum of global field + call mpp_global_sum_ad( domain, x3_ad, gsum_ad, position = position ) + + gsum_ad_save = 0 + ! sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x3_ad(i,j,k)*x3(i,j,k) + enddo + enddo + enddo + ! sum across the pes + call mpp_sum( gsum_ad_save ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (gsum_ad_save .eq. gsum_tl_save) then + print*, "3D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_i4" + endif + endif + + deallocate(x3, x3_ad) + + ! test 4D arrays + allocate( x4(isd:ied,jsd:jed,nz,1), x4_ad(isd:ied,jsd:jed,nz,1)) + + x4=0 + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x4(i,j,k,1) = int(i+j+k, kind=i4_kind) + enddo + enddo + enddo + + gsum_ad = 0 + gsum_tl = 0 + gsum_tl_save = 0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x4, position = position ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_ad = gsum_tl + + x4_ad = 0 + ! adjoint sum of global field + call mpp_global_sum_ad( domain, x4_ad, gsum_ad, position = position ) + gsum_ad_save = 0 + ! 
sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x4_ad(i,j,k,1)*x4(i,j,k,1) + enddo + enddo + enddo + ! sum across the pes + call mpp_sum( gsum_ad_save ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (gsum_ad_save .eq. gsum_tl_save) then + print*, "4d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_i4" + endif + endif + + deallocate(x4, x4_ad) + + ! test 5D arrays + allocate( x5(isd:ied,jsd:jed,nz,1,1), x5_ad(isd:ied,jsd:jed,nz,1,1)) + + x5=0 + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x5(i,j,k,1,1) = int(i+j+k, kind=i4_kind) + enddo + enddo + enddo + + gsum_ad = 0 + gsum_tl = 0 + gsum_tl_save = 0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x5, position = position ) + + gsum_tl_save = gsum_tl*gsum_tl + gsum_ad = gsum_tl + + x5_ad = 0 + ! adjoint sum of the global field + call mpp_global_sum_ad( domain, x5_ad, gsum_ad, position = position ) + + gsum_ad_save = 0 + ! sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x5_ad(i,j,k,1,1)*x5(i,j,k,1,1) + enddo + enddo + enddo + ! sum across the pes + call mpp_sum( gsum_ad_save ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (gsum_ad_save .eq. gsum_tl_save) then + print*, "5d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_i4" + endif + endif + + deallocate(x5, x5_ad) + + end subroutine test_global_sum_ad_i4 + + !> test the 64-bit integer global_sum_ad interfaces + subroutine test_global_sum_ad_i8 (domain_type) + character(len=*), intent(in) :: domain_type ! type of mpp domain to use + ! 
local + integer(i8_kind) :: gsum_tl, gsum_ad + integer(i8_kind) :: gsum_tl_save, gsum_ad_save + integer :: i,j,k, ishift, jshift, position + integer :: isd, ied, jsd, jed + + type(domain2D) :: domain + integer(i8_kind), allocatable, dimension(:,:) :: x2, x2_ad + integer(i8_kind), allocatable, dimension(:,:,:) :: x3, x3_ad + integer(i8_kind), allocatable, dimension(:,:,:,:) :: x4, x4_ad + integer(i8_kind), allocatable, dimension(:,:,:,:,:) :: x5, x5_ad + + call generate_domain(domain, domain_type) + + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + + position = CENTER + + ! test the 2D arrays + allocate( x2(isd:ied,jsd:jed), x2_ad(isd:ied,jsd:jed)) + + x2=0 + do j = jsd, jed + do i = isd, ied + x2(i,j) = int(i+j, kind=i8_kind) + enddo + enddo + + gsum_ad = 0 + gsum_tl = 0 + gsum_tl_save = 0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x2, position = position ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_ad = gsum_tl + + x2_ad = 0 + + ! adjoint sum of global field + call mpp_global_sum_ad( domain, x2_ad, gsum_ad, position = position ) + + gsum_ad_save = 0 + ! sum the original global sum and the adjoint global sum + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x2_ad(i,j)*x2(i,j) + enddo + enddo + ! sum across the pes + call mpp_sum( gsum_ad_save ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (gsum_ad_save .eq. gsum_tl_save) then + print*, "2D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_i8" + endif + endif + deallocate(x2, x2_ad) + + ! test 3D arrays + allocate( x3(isd:ied,jsd:jed,nz), x3_ad(isd:ied,jsd:jed,nz)) + + x3=0 + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x3(i,j,k) = int(i+j+k, kind=i8_kind) + enddo + enddo + enddo + + gsum_tl = 0 + gsum_ad = 0 + gsum_tl_save = 0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x3, position = position ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_ad = gsum_tl + + x3_ad = 0 + ! 
adjoint sum of global field + call mpp_global_sum_ad( domain, x3_ad, gsum_ad, position = position ) + + gsum_ad_save = 0 + ! sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x3_ad(i,j,k)*x3(i,j,k) + enddo + enddo + enddo + ! sum across the pes + call mpp_sum( gsum_ad_save ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (gsum_ad_save .eq. gsum_tl_save) then + print*, "3D arrays Passed Adjoint Dot Test: mpp_global_sum_ad_i8" + endif + endif + + deallocate(x3, x3_ad) + + ! test 4D arrays + allocate( x4(isd:ied,jsd:jed,nz,1), x4_ad(isd:ied,jsd:jed,nz,1)) + + x4=0 + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x4(i,j,k,1) = int(i+j+k, kind=i8_kind) + enddo + enddo + enddo + + gsum_ad = 0 + gsum_tl = 0 + gsum_tl_save = 0 + ! global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x4, position = position ) + + gsum_tl_save = gsum_tl*gsum_tl + + gsum_ad = gsum_tl + + x4_ad = 0 + ! adjoint sum of global field + call mpp_global_sum_ad( domain, x4_ad, gsum_ad, position = position ) + gsum_ad_save = 0 + ! sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x4_ad(i,j,k,1)*x4(i,j,k,1) + enddo + enddo + enddo + ! sum across the pes + call mpp_sum( gsum_ad_save ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (gsum_ad_save .eq. gsum_tl_save) then + print*, "4d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_i8" + endif + endif + + deallocate(x4, x4_ad) + + ! test 5D arrays + allocate( x5(isd:ied,jsd:jed,nz,1,1), x5_ad(isd:ied,jsd:jed,nz,1,1)) + + x5=0 + do k = 1,nz + do j = jsd, jed + do i = isd, ied + x5(i,j,k,1,1) = int(i+j+k, kind=i8_kind) + enddo + enddo + enddo + + gsum_ad = 0 + gsum_tl = 0 + gsum_tl_save = 0 + ! 
global sum of the domain-decomposed array + gsum_tl = mpp_global_sum( domain, x5, position = position ) + + gsum_tl_save = gsum_tl*gsum_tl + gsum_ad = gsum_tl + + x5_ad = 0 + ! adjoint sum of the global field + call mpp_global_sum_ad( domain, x5_ad, gsum_ad, position = position ) + + gsum_ad_save = 0 + ! sum the the original global sum and the adjoint global sum + do k = 1,nz + do j = jsd, jed + do i = isd, ied + gsum_ad_save = gsum_ad_save + x5_ad(i,j,k,1,1)*x5(i,j,k,1,1) + enddo + enddo + enddo + ! sum across the pes + call mpp_sum( gsum_ad_save ) + + pe = mpp_pe() + if( pe.EQ.mpp_root_pe() ) then + if (gsum_ad_save .eq. gsum_tl_save) then + print*, "5d arrays Passed Adjoint Dot Test: mpp_global_sum_ad_i8" + endif + endif + + deallocate(x5, x5_ad) + + end subroutine test_global_sum_ad_i8 + + !> define the 2D test domain + subroutine generate_domain(domain, domain_type) + type(domain2D), intent(inout) :: domain ! 2D mpp domain + character(len=*), intent(in) :: domain_type ! type of domain to generate + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + + select case(trim(domain_type)) + case( 'Simple' ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, & + ehalo=ehalo, shalo=shalo, nhalo=nhalo, name=domain_type ) + case( 'Cyclic symmetry center') + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, & + ehalo=ehalo, shalo=shalo, nhalo=nhalo, name=domain_type, symmetry = .true., & + xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN ) + case default + call mpp_error( FATAL, 'test_mpp_global_sum_ad: no such test: '//trim(domain_type)) + end select + end subroutine generate_domain + +end program test_mpp_global_sum_ad diff --git a/test_fms/mpp/test_mpp_global_sum_ad.sh b/test_fms/mpp/test_mpp_global_sum_ad.sh new file mode 100755 index 0000000000..3fb8d4c317 --- /dev/null +++ b/test_fms/mpp/test_mpp_global_sum_ad.sh @@ -0,0 +1,55 @@ +#!/bin/sh + +#*********************************************************************** 
+# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Jessica Liptak + +# Set common test settings. +. ../test_common.sh + +skip_test="no" + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi + +# Do we need to oversubscribe +if [ ${nProc} -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. + skip_test="skip" +elif [ $nProc -ge 4 ] +then + # Run the test + run_test test_mpp_global_sum_ad 4 $skip_test +fi + diff --git a/test_fms/mpp/test_mpp_mem_dump b/test_fms/mpp/test_mpp_mem_dump new file mode 100755 index 0000000000..7392962a95 --- /dev/null +++ b/test_fms/mpp/test_mpp_mem_dump @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_mem_dump - temporary wrapper script for .libs/test_mpp_mem_dump +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_mem_dump program cannot be directly executed until all the libtool +# libraries that it depends on are installed. 
+# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. 
+# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_mem_dump:test_mpp_mem_dump:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_mem_dump:test_mpp_mem_dump:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_mem_dump:test_mpp_mem_dump:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. 
+ thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_mem_dump' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 
1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_memuse b/test_fms/mpp/test_mpp_memuse new file mode 100755 index 0000000000..a879a20572 --- /dev/null +++ b/test_fms/mpp/test_mpp_memuse @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_memuse - temporary wrapper script for .libs/test_mpp_memuse +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_memuse program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. 
+func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_memuse:test_mpp_memuse:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_memuse:test_mpp_memuse:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_memuse:test_mpp_memuse:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' 
+ if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_memuse' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_memutils_begin_2x b/test_fms/mpp/test_mpp_memutils_begin_2x new file mode 100755 index 0000000000..2fab291aad --- /dev/null +++ b/test_fms/mpp/test_mpp_memutils_begin_2x @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_memutils_begin_2x - temporary wrapper script for .libs/test_mpp_memutils_begin_2x +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_memutils_begin_2x program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. 
It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_memutils_begin_2x:test_mpp_memutils_begin_2x:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_memutils_begin_2x:test_mpp_memutils_begin_2x:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_memutils_begin_2x:test_mpp_memutils_begin_2x:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. 
+ thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_memutils_begin_2x' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. 
+ $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_memutils_begin_end b/test_fms/mpp/test_mpp_memutils_begin_end new file mode 100755 index 0000000000..4dd29a705a --- /dev/null +++ b/test_fms/mpp/test_mpp_memutils_begin_end @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_memutils_begin_end - temporary wrapper script for .libs/test_mpp_memutils_begin_end +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_memutils_begin_end program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. 
+ if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_memutils_begin_end:test_mpp_memutils_begin_end:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_memutils_begin_end:test_mpp_memutils_begin_end:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_memutils_begin_end:test_mpp_memutils_begin_end:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. 
+ WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_memutils_begin_end' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_memutils_end_before_begin b/test_fms/mpp/test_mpp_memutils_end_before_begin new file mode 100755 index 0000000000..764dce4e6b --- /dev/null +++ b/test_fms/mpp/test_mpp_memutils_end_before_begin @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_memutils_end_before_begin - temporary wrapper script for .libs/test_mpp_memutils_end_before_begin +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_memutils_end_before_begin program cannot be directly executed until all the libtool +# libraries that it depends on are installed. 
+# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. 
+# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_memutils_end_before_begin:test_mpp_memutils_end_before_begin:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_memutils_end_before_begin:test_mpp_memutils_end_before_begin:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_memutils_end_before_begin:test_mpp_memutils_end_before_begin:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. 
+func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. 
+ absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_memutils_end_before_begin' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_npes b/test_fms/mpp/test_mpp_npes new file mode 100755 index 0000000000..8ddc155eae --- /dev/null +++ b/test_fms/mpp/test_mpp_npes @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_npes - temporary wrapper script for .libs/test_mpp_npes +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_npes program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. 
+ alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. 
+ lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_npes:test_mpp_npes:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_npes:test_mpp_npes:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_npes:test_mpp_npes:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. 
+ if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_npes' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_pe b/test_fms/mpp/test_mpp_pe new file mode 100755 index 0000000000..a68116440e --- /dev/null +++ b/test_fms/mpp/test_mpp_pe @@ -0,0 +1,210 @@ +#! 
/bin/sh + +# test_mpp_pe - temporary wrapper script for .libs/test_mpp_pe +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_pe program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. 
These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_pe:test_mpp_pe:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_pe:test_mpp_pe:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_pe:test_mpp_pe:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' 
+ if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_pe' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_print_memuse_stats_file b/test_fms/mpp/test_mpp_print_memuse_stats_file new file mode 100755 index 0000000000..8c92c68f80 --- /dev/null +++ b/test_fms/mpp/test_mpp_print_memuse_stats_file @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_print_memuse_stats_file - temporary wrapper script for .libs/test_mpp_print_memuse_stats_file +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_print_memuse_stats_file program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. 
It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_print_memuse_stats_file:test_mpp_print_memuse_stats_file:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_print_memuse_stats_file:test_mpp_print_memuse_stats_file:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_print_memuse_stats_file:test_mpp_print_memuse_stats_file:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. 
+ thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_print_memuse_stats_file' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. 
+ $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_print_memuse_stats_stderr b/test_fms/mpp/test_mpp_print_memuse_stats_stderr new file mode 100755 index 0000000000..de73398c9e --- /dev/null +++ b/test_fms/mpp/test_mpp_print_memuse_stats_stderr @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_print_memuse_stats_stderr - temporary wrapper script for .libs/test_mpp_print_memuse_stats_stderr +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_print_memuse_stats_stderr program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. 
+if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_print_memuse_stats_stderr:test_mpp_print_memuse_stats_stderr:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_print_memuse_stats_stderr:test_mpp_print_memuse_stats_stderr:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_print_memuse_stats_stderr:test_mpp_print_memuse_stats_stderr:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. 
+ WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_print_memuse_stats_stderr' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_pset b/test_fms/mpp/test_mpp_pset new file mode 100755 index 0000000000..dafa0b15bb --- /dev/null +++ b/test_fms/mpp/test_mpp_pset @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_pset - temporary wrapper script for .libs/test_mpp_pset +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_pset program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. 
It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_pset:test_mpp_pset:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_pset:test_mpp_pset:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_pset:test_mpp_pset:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. 
+ + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_pset' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 
1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_root_pe b/test_fms/mpp/test_mpp_root_pe new file mode 100755 index 0000000000..ee862977d9 --- /dev/null +++ b/test_fms/mpp/test_mpp_root_pe @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_root_pe - temporary wrapper script for .libs/test_mpp_root_pe +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_root_pe program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. 
+func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_root_pe:test_mpp_root_pe:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_root_pe:test_mpp_root_pe:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_root_pe:test_mpp_root_pe:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' 
+ if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_root_pe' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_sendrecv.F90 b/test_fms/mpp/test_mpp_sendrecv.F90 new file mode 100644 index 0000000000..d6c315994e --- /dev/null +++ b/test_fms/mpp/test_mpp_sendrecv.F90 @@ -0,0 +1,632 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. 
+!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +#ifdef SYSTEM_CLOCK +#undef SYSTEM_CLOCK +#endif + +!> @author Miguel Zuniga +!> @brief Test various mpp_send and mpp_recv routines. +!> @note The rain in spain stays mainly on the plain. +!> @todo Follow the white rabbit. +program test_mpp_sendrecv + +#ifdef sgi_mipspro + use shmem_interface +#endif + + use mpp_mod, only : mpp_init, mpp_exit, mpp_pe, mpp_npes, mpp_root_pe, stdout + use mpp_mod, only : mpp_sync + use mpp_mod, only : mpp_declare_pelist, mpp_set_current_pelist, mpp_set_stack_size + use mpp_mod, only : mpp_send, mpp_recv, mpp_error, FATAL + use mpp_io_mod, only: mpp_io_init, mpp_flush + use mpp_mod, only : mpp_init_test_requests_allocated + use platform_mod + +#ifdef use_MPI_GSM + use mpp_mod, only : mpp_gsm_free +#endif + + implicit none + + integer, parameter :: n=1048576 + real, allocatable, dimension(:) :: a, b, c +#ifdef use_MPI_GSM + real :: d(n) + pointer (locd, d) +#else + real, allocatable, dimension(:) :: d + integer(kind=i8_kind) :: locd +#endif + integer :: pe, npes, root, istat + integer :: out_unit + real :: dt + integer :: ierr + + call mpp_init(mpp_init_test_requests_allocated) + call mpp_io_init() + call mpp_set_stack_size(3145746) + pe = mpp_pe() + npes = mpp_npes() + root = mpp_root_pe() + out_unit = stdout() + + if( pe.EQ.root ) print *, '------------------> Calling test_sendrecv <------------------' + call test_sendrecv_2D(npes,pe,root,out_unit) + call test_sendrecv_3D(npes,pe,root,out_unit) + if( pe.EQ.root ) print *, '------------------> Finished test_sendrecv <------------------' + + call 
MPI_finalize(ierr) + +contains + +!> @brief Call the type-specific test_sendrecv_2D routines. + subroutine test_sendrecv_2D(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + if(npes < 3)then + call mpp_error(FATAL, "Test_sendrecv_2D: minimum of 3 ranks required. Not testing gather; too few ranks.") + endif + write(out_unit,*) + + call test_sendrecv_2D_R4(npes, pe, root, out_unit) + + call test_sendrecv_2D_R8(npes, pe, root, out_unit) + + call test_sendrecv_2D_I4(npes, pe, root, out_unit) + + call test_sendrecv_2D_I8(npes, pe, root, out_unit) + + end subroutine test_sendrecv_2D + + +!> @brief Call the type-specific test_sendrecv_3D routines. + subroutine test_sendrecv_3D(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + if(npes < 3)then + call mpp_error(FATAL, "Test_sendrecv_3D: minimum of 3 ranks required. Not testing gather; too few ranks.") + endif + write(out_unit,*) + + call test_sendrecv_3D_R4(npes, pe, root, out_unit) + + call test_sendrecv_3D_R8(npes, pe, root, out_unit) + + call test_sendrecv_3D_I4(npes, pe, root, out_unit) + + call test_sendrecv_3D_I8(npes, pe, root, out_unit) + + end subroutine test_sendrecv_3D + + + !> @brief Test together the 2D mpp_send and mpp_recv functions with 32-bit real data arguments. + subroutine test_sendrecv_2D_R4(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i,j, p + real(kind=r4_kind), allocatable, dimension(:,:) :: data !!Data to be sendrecved + integer :: DS + + DS = 9 + allocate(data(DS, DS)) + + !!The full PE list [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1.0 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4) is 34.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + data(i,j) = i*10.0 + j*1.0 + enddo + enddo + endif + + !! 
Send from the source pe all of the data to all other pes + !! And receive from all other pes + if ( pe == root ) then + do p = 1,npes-1 + call mpp_send( data, DS* DS, p ) + end do + else + call mpp_recv( data, DS * DS, 0 ) + end if + + call mpp_sync() ! Needed ? + + !! Verify that the data was correctly transmitted. + if(ANY(pe == pelist(1:npes-1))) then + do j = 1, DS + do i = 1, DS + if (data(i,j) /= ( i*10.0 + j*1.0)) then + call mpp_error(FATAL, "Test sendrecv 2D R4 failed - basic copy area.") + endif + enddo + enddo + endif + + call mpp_sync() ! + write(out_unit,*) "Test test_sendrecv_2D_R4 successful ." + + end subroutine test_sendrecv_2D_R4 + + !> @brief Test together the 2D mpp_send and mpp_recv functions with 64-bit real data arguments. + subroutine test_sendrecv_2D_R8(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i,j, p + real(kind=r8_kind), allocatable, dimension(:,:) :: data !!Data to be sendrecved + integer :: DS + + DS = 9 + allocate(data(DS, DS)) + + !!The full PE list [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1.0 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4) is 34.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + data(i,j) = i*10.0 + j*1.0 + enddo + enddo + endif + + !! Send from the source pe all of the data to all other pes + !! And receive from all other pes + if ( pe == root ) then + do p = 1,npes-1 + call mpp_send( data, DS* DS, p ) + end do + else + call mpp_recv( data, DS * DS, 0 ) + end if + + call mpp_sync() ! Needed ? + + + !! Verify that the data was correctly transmitted. 
+ if(ANY(pe == pelist(1:npes-1))) then + do j = 1, DS + do i = 1, DS + if (data(i,j) /= ( i*10.0 + j*1.0)) then + call mpp_error(FATAL, "Test sendrecv 2D R8 failed - basic copy area.") + endif + enddo + enddo + endif + + call mpp_sync() ! + write(out_unit,*) "Test test_sendrecv_2D_R8 successful ." + + end subroutine test_sendrecv_2D_R8 + + !> @brief Test together the mpp_send and mpp_recv 3D functions with 32-bit real data arguments. + subroutine test_sendrecv_3D_R4(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i,j,k, p + real(kind=r4_kind), allocatable, dimension(:,:,:) :: data !!Data to be sendrecved + integer :: DS + integer :: iz, jz !!The zeroth element to be sendrecved is at pos data(is+iz, js+jz) + integer :: is, ie, js, je !!The amount of data to be sendrecved is (ie - is)*(je - js) + integer :: NZ + + + NZ = 9 !! Depth of the square tube to be sendrecved. + DS = 8 !! DS should be less than 10 for the tests below to make sense. + allocate(data(DS, DS, NZ)) + + !!The full PE list is [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1.0 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4,5) is 543.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + do k = 1,NZ + data(i,j, k) = k*100.0 + j*10.0 + i*1.0 + enddo + enddo + enddo + endif + + + !! Send from the source pe all of the data to all other pes + !! And receive from all other pes + if ( pe == root ) then + do p = 1,npes-1 + call mpp_send( data, DS* DS* NZ, p ) + end do + else + call mpp_recv( data, DS * DS * NZ, 0 ) + end if + + call mpp_sync() ! Needed ? + + !! Verify the transmitted data + if(ANY(pe == pelist(1:npes-1))) then + !!Note below row (id index of "data() equivalent or formula") changing fastest. 
+ do k = 1, NZ + do j = 1, DS + do i = 1, DS + if (data(i,j, k) /= ( k*100.0 + j*10.0 + i*1.0 )) then + call mpp_error(FATAL, "Test sendrecv 3D R4 failed - basic copy area.") + endif + enddo + enddo + enddo + endif + + call mpp_sync() + + write(out_unit,*) "Test sendrecv 3D R4 successful." + + end subroutine test_sendrecv_3D_R4 + + + !> @brief Test together the 3D mpp_send and mpp_recv 3D functions with 64-bit real data arguments. + subroutine test_sendrecv_3D_R8(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer :: i,j,k, p + real(kind=r8_kind), allocatable, dimension(:,:,:) :: data !!Data to be sendrecved + integer :: DS + integer :: iz, jz !!The zeroth element to be sendrecved is at pos data(is+iz, js+jz) + integer :: is, ie, js, je !!The amount of data to be sendrecved is (ie - is)*(je - js) + integer :: NZ + + + NZ = 9 !! Depth of the square tube to be sendrecved. + DS = 8 !! DS should be less than 10 for the tests below to make sense. + allocate(data(DS, DS, NZ)) + + !!The full PE list is [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1.0 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4,5) is 543.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + do k = 1,NZ + data(i,j, k) = k*100.0 + j*10.0 + i*1.0 + enddo + enddo + enddo + endif + + + !! Send from the source pe all of the data to all other pes + !! And receive from all other pes + if ( pe == root ) then + do p = 1,npes-1 + call mpp_send( data, DS* DS* NZ, p ) + end do + else + call mpp_recv( data, DS * DS * NZ, 0 ) + end if + + call mpp_sync() ! Needed ? + + !! Verify the transmitted data + if(ANY(pe == pelist(1:npes-1))) then + !!Note below row (id index of "data() equivalent or formula") changing fastest. 
+ do k = 1, NZ + do j = 1, DS + do i = 1, DS + if (data(i,j, k) /= ( k*100.0 + j*10.0 + i*1.0 )) then + call mpp_error(FATAL, "Test sendrecv 3D R8 failed - basic copy area.") + endif + enddo + enddo + enddo + endif + + call mpp_sync() ! + + write(out_unit,*) "Test sendrecv 3D R8 successful." + + end subroutine test_sendrecv_3D_R8 + + !> @brief Test together the 2D mpp_send and mpp_recv functions with 32-bit integer data arguments. + subroutine test_sendrecv_2D_I4(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer(kind=i4_kind) :: i,j + integer(kind=i4_kind), allocatable, dimension(:,:) :: data !!Data to be sendrecved + integer :: DS, p + + DS = 9 + allocate(data(DS, DS)) + + !!The full PE list [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4) is 34.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + data(i,j) = i*10 + j + enddo + enddo + endif + + !! Send from the source pe all of the data to all other pes + !! And receive from all other pes + if ( pe == root ) then + do p = 1,npes-1 + call mpp_send( data, DS* DS, p ) + end do + else + call mpp_recv( data, DS * DS, 0 ) + end if + + call mpp_sync() ! Needed ? + + !! Verify that the data was correctly transmitted. + if(ANY(pe == pelist(1:npes-1))) then + do j = 1, DS + do i = 1, DS + if (data(i,j) /= ( i * 10 + j )) then + call mpp_error(FATAL, "Test sendrecv 2D I4 failed - basic copy area.") + endif + enddo + enddo + endif + + call mpp_sync() ! + write(out_unit,*) "Test test_sendrecv_2D_I4 successful ." + + end subroutine test_sendrecv_2D_I4 + + !> @brief Test together the 2D mpp_send and mpp_recv functions with 64-bit integer data arguments. 
+ subroutine test_sendrecv_2D_I8(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer(kind=i8_kind) :: i,j + integer(kind=i8_kind), allocatable, dimension(:,:) :: data !!Data to be sendrecved + integer :: DS, p + + DS = 9 + allocate(data(DS, DS)) + + !!The full PE list [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4) is 34.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + data(i,j) = i*10 + j + enddo + enddo + endif + + !! Send from the source pe all of the data to all other pes + !! And receive from all other pes + if ( pe == root ) then + do p = 1,npes-1 + call mpp_send( data, DS* DS, p ) + end do + else + call mpp_recv( data, DS * DS, 0 ) + end if + + call mpp_sync() ! Needed ? + + !! Verify that the data was correctly transmitted. + if(ANY(pe == pelist(1:npes-1))) then + do j = 1, DS + do i = 1, DS + if (data(i,j) /= ( i * 10 + j )) then + call mpp_error(FATAL, "Test sendrecv 2D I8 failed - basic copy area.") + endif + enddo + enddo + endif + + call mpp_sync() ! + write(out_unit,*) "Test test_sendrecv_2D_I8 successful ." + + end subroutine test_sendrecv_2D_I8 + + !> @brief Test together the mpp_send and mpp_recv 3D functions with 32-bit integer data arguments. + subroutine test_sendrecv_3D_I4(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer(kind=i4_kind) :: i,j,k + integer(kind=i4_kind), allocatable, dimension(:,:,:) :: data !!Data to be sendrecved + integer :: DS + integer :: iz, jz !!The zeroth element to be sendrecved is at pos data(is+iz, js+jz) + integer :: is, ie, js, je !!The amount of data to be sendrecved is (ie - is)*(je - js) + integer :: NZ, p + + + NZ = 9 !! Depth of the square tube to be sendrecved. 
+ DS = 8 !! DS should be less than 10 for the tests below to make sense. + allocate(data(DS, DS, NZ)) + + !!The full PE list is [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4,5) is 543.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + do k = 1,NZ + data(i,j, k) = k*100 + j*10 + i + enddo + enddo + enddo + endif + + + !! Send from the source pe all of the data to all other pes + !! And receive from all other pes + if ( pe == root ) then + do p = 1,npes-1 + call mpp_send( data, DS* DS* NZ, p ) + end do + else + call mpp_recv( data, DS * DS * NZ, 0 ) + end if + + call mpp_sync() ! Needed ? + + !! Verify the transmitted data + if(ANY(pe == pelist(1:npes-1))) then + !!Note below row (id index of "data() equivalent or formula") changing fastest. + do k = 1, NZ + do j = 1, DS + do i = 1, DS + if (data(i,j, k) /= ( k * 100 + j*10 + i )) then + call mpp_error(FATAL, "Test sendrecv 3D I4 failed - basic copy area.") + endif + enddo + enddo + enddo + endif + + call mpp_sync() ! + + write(out_unit,*) "Test sendrecv 3D I4 successful." + + end subroutine test_sendrecv_3D_I4 + + !> @brief Test together the 3D mpp_send and mpp_recv 3D functions with 64-bit integer data arguments. + subroutine test_sendrecv_3D_I8(npes,pe,root,out_unit) + integer, intent(in) :: npes,pe,root,out_unit + + integer :: pelist(npes) + integer(kind=i8_kind) :: i,j,k + integer(kind=i8_kind), allocatable, dimension(:,:,:) :: data !!Data to be sendrecved + integer :: DS + integer :: iz, jz !!The zeroth element to be sendrecved is at pos data(is+iz, js+jz) + integer :: is, ie, js, je !!The amount of data to be sendrecved is (ie - is)*(je - js) + integer :: NZ, p + + + NZ = 9 !! Depth of the square tube to be sendrecved. + DS = 8 !! 
DS should be less than 10 for the tests below to make sense. + allocate(data(DS, DS, NZ)) + + !!The full PE list is [0, ...,npes-1] + do i=0,npes-1 + pelist(i+1) = i + enddo + + !!Initialize all data on all PEs + data = -1 + !! Re-initialize data on the root PE only. + !! Data is such that we can calculate what it should be with a Formula + !! using the indecies. E.g.. data(3,4,5) is 543.000, etc. + if (pe == root) then + do i = 1,DS + do j = 1,DS + do k = 1,NZ + data(i,j, k) = k*100 + j*10 + i + enddo + enddo + enddo + endif + + + !! Send from the source pe all of the data to all other pes + !! And receive from all other pes + if ( pe == root ) then + do p = 1,npes-1 + call mpp_send( data, DS* DS* NZ, p ) + end do + else + call mpp_recv( data, DS * DS * NZ, 0 ) + end if + + call mpp_sync() ! Needed ? + + !! Verify the transmitted data + if(ANY(pe == pelist(1:npes-1))) then + !!Note below row (id index of "data() equivalent or formula") changing fastest. + do k = 1, NZ + do j = 1, DS + do i = 1, DS + if (data(i,j, k) /= ( k * 100 + j*10 + i )) then + call mpp_error(FATAL, "Test sendrecv 3D I8 failed - basic copy area.") + endif + enddo + enddo + enddo + endif + + call mpp_sync() ! + + write(out_unit,*) "Test sendrecv 3D I8 successful." + + end subroutine test_sendrecv_3D_I8 + +end program test_mpp_sendrecv diff --git a/test_fms/mpp/test_mpp_sendrecv.sh b/test_fms/mpp/test_mpp_sendrecv.sh new file mode 100755 index 0000000000..a958e9474c --- /dev/null +++ b/test_fms/mpp/test_mpp_sendrecv.sh @@ -0,0 +1,58 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. 
+# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Ed Hartnett 11/29/19 + +# Set common test settings. +. ../test_common.sh + +skip_test="no" + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi + +# Do we need to oversubscribe +if [ ${nProc} -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. + skip_test="skip" +elif [ $nProc -lt 4 ] +then + # Need to oversubscribe the MPI + run_test test_mpp_sendrecv 4 $skip_test "true" +fi + +touch input.nml +run_test test_mpp_sendrecv 4 $skip_test + diff --git a/test_fms/mpp/test_mpp_sum b/test_fms/mpp/test_mpp_sum new file mode 100755 index 0000000000..f89705faec --- /dev/null +++ b/test_fms/mpp/test_mpp_sum @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_sum - temporary wrapper script for .libs/test_mpp_sum +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_sum program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. 
+sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_sum:test_mpp_sum:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_sum:test_mpp_sum:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_sum:test_mpp_sum:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. 
+ + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_sum' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 
1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_sum.F90 b/test_fms/mpp/test_mpp_sum.F90 new file mode 100644 index 0000000000..debd8a282c --- /dev/null +++ b/test_fms/mpp/test_mpp_sum.F90 @@ -0,0 +1,616 @@ + +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @file +!! @author Lauren Chilutti +!! @brief Test program for the mpp_sum interface. +!! @email gfdl.climate.model.info@noaa.gov +!! @description This test program is for testing the mpp_sum interface. + +program test_mpp_sum + + use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe + use mpp_mod, only : mpp_sync + use mpp_mod, only : mpp_set_stack_size, mpp_init_test_requests_allocated + use mpp_mod, only : mpp_sum + use mpp_mod, only : mpp_error, FATAL + use platform_mod + + implicit none + + integer :: ierr + integer :: i, pe, npes, root, fullsum, pesum + integer, allocatable, dimension(:) :: pelist + + call mpp_init(mpp_init_test_requests_allocated) + call mpp_set_stack_size(3145746) + pe = mpp_pe() + npes = mpp_npes() + root = mpp_root_pe() + allocate( pelist(0:npes-2) ) + if (pe .LE. npes-2) pelist = (/(i,i=0,npes-2)/) + if (pe .EQ. 
root) then + fullsum = sum( (/(i, i=1,npes, 1)/) ) + pesum = sum( (/(i, i=1,npes-1, 1)/) ) + endif + + if( pe.EQ.root ) print *, '------------------> Calling test_mpp_sum <------------------' + call test_mpp_sum_scalar(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_2D(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_3D(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_4D(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_5D(pe,npes,root,pelist,fullsum,pesum) + if( pe.EQ.root ) print *, '------------------> Finished test_mpp_sum <------------------' + + call MPI_FINALIZE(ierr) + +contains + + subroutine test_mpp_sum_scalar(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + + call test_mpp_sum_scalar_r4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_scalar_r8(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_scalar_i4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_scalar_i8(pe,npes,root,pelist,fullsum,pesum) + + end subroutine test_mpp_sum_scalar + + !> Test the functionality of mpp_transmit for an r4_scalar. + subroutine test_mpp_sum_scalar_r4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + real(kind=r4_kind) :: a4, suma4, b4, sumb4 + + a4 = real(pe+1, kind=r4_kind) + b4 = a4 + call mpp_sync() + call mpp_sum(a4) + if (pe .LE. npes-2) call mpp_sum(b4, pelist=pelist) + if (pe .EQ. root) then + suma4 = real(fullsum, kind=r4_kind) + sumb4 = real(pesum, kind=r4_kind) + if (a4 .ne. suma4) call mpp_error(FATAL, "Scalar_r4: mpp_sum differs from fortran intrinsic sum") + if (b4 .ne. sumb4) call mpp_error(FATAL, "Scalar_r4 with pelist: mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_scalar_r4 + + !> Test the functionality of mpp_transmit for an r8_scalar. 
+ subroutine test_mpp_sum_scalar_r8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + real(kind=r8_kind) :: a8, suma8, b8, sumb8 + + a8 = real(pe+1, kind=r8_kind) + b8 = a8 + call mpp_sync() + call mpp_sum(a8) + if (pe .LE. npes-2) call mpp_sum(b8, pelist=pelist) + if (pe .EQ. root) then + suma8 = real(fullsum, kind=r8_kind) + sumb8 = real(pesum, kind=r8_kind) + if (a8 .ne. suma8) call mpp_error(FATAL, "Scalar_r8: mpp_sum differs from fortran intrinsic sum") + if (b8 .ne. sumb8) call mpp_error(FATAL, "Scalar_r8 with pelist: mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_scalar_r8 + + !> Test the functionality of mpp_transmit for an i4_scalar. + subroutine test_mpp_sum_scalar_i4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer(kind=i4_kind) :: a4, suma4, b4, sumb4 + + a4 = int(pe+1, kind=i4_kind) + b4 = a4 + call mpp_sync() + call mpp_sum(a4) + if (pe .LE. npes-2) call mpp_sum(b4, pelist=pelist) + if (pe .EQ. root) then + suma4 = int(fullsum, kind=i4_kind) + sumb4 = int(pesum, kind=i4_kind) + if (a4 .ne. suma4) call mpp_error(FATAL, "Scalar_i4: mpp_sum differs from fortran intrinsic sum") + if (b4 .ne. sumb4) call mpp_error(FATAL, "Scalar_i4 with pelist: mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_scalar_i4 + + !> Test the functionality of mpp_transmit for an i8_scalar. + subroutine test_mpp_sum_scalar_i8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer(kind=i8_kind) :: a8, suma8, b8, sumb8 + + a8 = int(pe+1, kind=i8_kind) + b8 = a8 + call mpp_sync() + call mpp_sum(a8) + if (pe .LE. npes-2) call mpp_sum(b8, pelist=pelist) + if (pe .EQ. 
root) then + suma8 = int(fullsum, kind=i8_kind) + sumb8 = int(pesum, kind=i8_kind) + if (a8 .ne. suma8) call mpp_error(FATAL, "Scalar_i8: mpp_sum differs from fortran intrinsic sum") + if (b8 .ne. sumb8) call mpp_error(FATAL, "Scalar_i8 with pelist: mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_scalar_i8 + + subroutine test_mpp_sum_2D(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + + call test_mpp_sum_2D_r4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_2D_r8(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_2D_i4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_2D_i8(pe,npes,root,pelist,fullsum,pesum) + + end subroutine test_mpp_sum_2D + + !> Test the functionality of mpp_transmit for an r4_2D. + subroutine test_mpp_sum_2D_r4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=4 + real(kind=r4_kind), dimension(2,2) :: a4, suma4, b4, sumb4, c4, sumc4 + + a4 = real(pe+1, kind=r4_kind) + b4 = a4 + c4 = a4 + call mpp_sync() + call mpp_sum(a4,n) + if (pe .LE. npes-2) call mpp_sum(b4, n, pelist=pelist) + call mpp_sum(c4,n-1) + if (pe .EQ. root) then + suma4 = real(fullsum, kind=r4_kind) + sumb4 = real(pesum, kind=r4_kind) + sumc4 = suma4 + sumc4(2,2) = 1 + if (all(a4 .ne. suma4)) call mpp_error(FATAL, "2D_r4: mpp_sum differs from fortran intrinsic sum") + if (all(b4 .ne. sumb4)) call mpp_error(FATAL, "2D_r4 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c4 .ne. sumc4)) call mpp_error(FATAL, "2D_r4 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_2D_r4 + + !> Test the functionality of mpp_transmit for an r8_2D. 
+ subroutine test_mpp_sum_2D_r8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=4 + real(kind=r8_kind), dimension(2,2) :: a8, suma8, b8, sumb8, c8, sumc8 + + a8 = real(pe+1, kind=r8_kind) + b8 = a8 + c8 = a8 + call mpp_sync() + call mpp_sum(a8,n) + if (pe .LE. npes-2) call mpp_sum(b8, n, pelist=pelist) + call mpp_sum(c8, n-1) + if (pe .EQ. root) then + suma8 = real(fullsum, kind=r8_kind) + sumb8 = real(pesum, kind=r8_kind) + sumc8 = suma8 + sumc8(2,2) = 1 + if (all(a8 .ne. suma8)) call mpp_error(FATAL, "2D_r8: mpp_sum differs from fortran intrinsic sum") + if (all(b8 .ne. sumb8)) call mpp_error(FATAL, "2D_r8 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c8 .ne. sumc8)) call mpp_error(FATAL, "2D_r8 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_2D_r8 + + !> Test the functionality of mpp_transmit for an i4_2D. + subroutine test_mpp_sum_2D_i4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=4 + integer(kind=i4_kind), dimension(2,2) :: a4, suma4, b4, sumb4, c4, sumc4 + + a4 = int(pe+1, kind=i4_kind) + b4 = a4 + c4 = a4 + call mpp_sync() + call mpp_sum(a4,n) + if (pe .LE. npes-2) call mpp_sum(b4, n, pelist=pelist) + call mpp_sum(c4,n-1) + if (pe .EQ. root) then + suma4 = int(fullsum, kind=i4_kind) + sumb4 = int(pesum, kind=i4_kind) + sumc4 = suma4 + sumc4(2,2) = 1 + if (all(a4 .ne. suma4)) call mpp_error(FATAL, "2D_i4: mpp_sum differs from fortran intrinsic sum") + if (all(b4 .ne. sumb4)) call mpp_error(FATAL, "2D_i4 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c4 .ne. 
sumc4)) call mpp_error(FATAL, "2D_i4 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_2D_i4 + + !> Test the functionality of mpp_transmit for an i8_2D. + subroutine test_mpp_sum_2D_i8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=4 + integer(kind=i8_kind), dimension(2,2) :: a8, suma8, b8, sumb8, c8, sumc8 + + a8 = int(pe+1, kind=i8_kind) + b8 = a8 + c8 = a8 + call mpp_sync() + call mpp_sum(a8,n) + if (pe .LE. npes-2) call mpp_sum(b8, n, pelist=pelist) + call mpp_sum(c8,n-1) + if (pe .EQ. root) then + suma8 = int(fullsum, kind=i8_kind) + sumb8 = int(pesum, kind=i8_kind) + sumc8 = suma8 + sumc8(2,2) = 1 + if (all(a8 .ne. suma8)) call mpp_error(FATAL, "2D_i8: mpp_sum differs from fortran intrinsic sum") + if (all(b8 .ne. sumb8)) call mpp_error(FATAL, "2D_i8 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c8 .ne. sumc8)) call mpp_error(FATAL, "2D_i8 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_2D_i8 + + subroutine test_mpp_sum_3D(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + + call test_mpp_sum_3D_r4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_3D_r8(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_3D_i4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_3D_i8(pe,npes,root,pelist,fullsum,pesum) + + end subroutine test_mpp_sum_3D + + !> Test the functionality of mpp_transmit for an r4_3D. 
+ subroutine test_mpp_sum_3D_r4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=8 + real(kind=r4_kind), dimension(2,2,2) :: a4, suma4, b4, sumb4, c4, sumc4 + + a4 = real(pe+1, kind=r4_kind) + b4 = a4 + c4 = a4 + call mpp_sync() + call mpp_sum(a4,n) + if (pe .LE. npes-2) call mpp_sum(b4, n, pelist=pelist) + call mpp_sum(c4, n-1) + if (pe .EQ. root) then + suma4 = real(fullsum, kind=r4_kind) + sumb4 = real(pesum, kind=r4_kind) + sumc4 = suma4 + sumc4(2,2,2) = 1 + if (all(a4 .ne. suma4)) call mpp_error(FATAL, "3D_r4: mpp_sum differs from fortran intrinsic sum") + if (all(b4 .ne. sumb4)) call mpp_error(FATAL, "3D_r4 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c4 .ne. sumc4)) call mpp_error(FATAL, "3D_r4 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_3D_r4 + + !> Test the functionality of mpp_transmit for an r8_3D. + subroutine test_mpp_sum_3D_r8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=8 + real(kind=r8_kind), dimension(2,2,2) :: a8, suma8, b8, sumb8, c8, sumc8 + + a8 = real(pe+1, kind=r8_kind) + b8 = a8 + c8 = a8 + call mpp_sync() + call mpp_sum(a8,n) + if (pe .LE. npes-2) call mpp_sum(b8, n, pelist=pelist) + call mpp_sum(c8,n-1) + if (pe .EQ. root) then + suma8 = real(fullsum, kind=r8_kind) + sumb8 = real(pesum, kind=r8_kind) + sumc8 = suma8 + sumc8(2,2,2) = 1 + if (all(a8 .ne. suma8)) call mpp_error(FATAL, "3D_r8: mpp_sum differs from fortran intrinsic sum") + if (all(b8 .ne. sumb8)) call mpp_error(FATAL, "3D_r8 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c8 .ne. 
sumc8)) call mpp_error(FATAL, "3D_r8 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_3D_r8 + + !> Test the functionality of mpp_transmit for an i4_3D. + subroutine test_mpp_sum_3D_i4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=8 + integer(kind=i4_kind), dimension(2,2,2) :: a4, suma4, b4, sumb4, c4, sumc4 + + a4 = int(pe+1, kind=i4_kind) + b4 = a4 + c4 = a4 + call mpp_sync() + call mpp_sum(a4,n) + if (pe .LE. npes-2) call mpp_sum(b4, n, pelist=pelist) + call mpp_sum(c4,n-1) + if (pe .EQ. root) then + suma4 = int(fullsum, kind=i4_kind) + sumb4 = int(pesum, kind=i4_kind) + sumc4 = suma4 + sumc4(2,2,2) = 1 + if (all(a4 .ne. suma4)) call mpp_error(FATAL, "3D_i4: mpp_sum differs from fortran intrinsic sum") + if (all(b4 .ne. sumb4)) call mpp_error(FATAL, "3D_i4 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c4 .ne. sumc4)) call mpp_error(FATAL, "3D_i4 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_3D_i4 + + !> Test the functionality of mpp_transmit for an i8_3D. + subroutine test_mpp_sum_3D_i8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=8 + integer(kind=i8_kind), dimension(2,2,2) :: a8, suma8, b8, sumb8, c8, sumc8 + + a8 = int(pe+1, kind=i8_kind) + b8 = a8 + c8 = a8 + call mpp_sync() + call mpp_sum(a8,n) + if (pe .LE. npes-2) call mpp_sum(b8, n, pelist=pelist) + call mpp_sum(c8,n-1) + if (pe .EQ. root) then + suma8 = int(fullsum, kind=i8_kind) + sumb8 = int(pesum, kind=i8_kind) + sumc8 = suma8 + sumc8(2,2,2) = 1 + if (all(a8 .ne. suma8)) call mpp_error(FATAL, "3D_i8: mpp_sum differs from fortran intrinsic sum") + if (all(b8 .ne. 
sumb8)) call mpp_error(FATAL, "3D_i8 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c8 .ne. sumc8)) call mpp_error(FATAL, "3D_i8 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_3D_i8 + + subroutine test_mpp_sum_4D(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + + call test_mpp_sum_4D_r4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_4D_r8(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_4D_i4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_4D_i8(pe,npes,root,pelist,fullsum,pesum) + + end subroutine test_mpp_sum_4D + + !> Test the functionality of mpp_transmit for an r4_4D. + subroutine test_mpp_sum_4D_r4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=16 + real(kind=r4_kind), dimension(2,2,2,2) :: a4, suma4, b4, sumb4, c4, sumc4 + + a4 = real(pe+1, kind=r4_kind) + b4 = a4 + c4 = a4 + call mpp_sync() + call mpp_sum(a4,n) + if (pe .LE. npes-2) call mpp_sum(b4, n, pelist=pelist) + call mpp_sum(c4,n-1) + if (pe .EQ. root) then + suma4 = real(fullsum, kind=r4_kind) + sumb4 = real(pesum, kind=r4_kind) + sumc4 = suma4 + sumc4(2,2,2,2) = 1 + if (all(a4 .ne. suma4)) call mpp_error(FATAL, "4D_r4: mpp_sum differs from fortran intrinsic sum") + if (all(b4 .ne. sumb4)) call mpp_error(FATAL, "4D_r4 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c4 .ne. sumc4)) call mpp_error(FATAL, "4D_r4 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_4D_r4 + + !> Test the functionality of mpp_transmit for an r8_4D. 
+ subroutine test_mpp_sum_4D_r8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=16 + real(kind=r8_kind), dimension(2,2,2,2) :: a8, suma8, b8, sumb8, c8, sumc8 + + a8 = real(pe+1, kind=r8_kind) + b8 = a8 + c8 = a8 + call mpp_sync() + call mpp_sum(a8,n) + if (pe .LE. npes-2) call mpp_sum(b8, n, pelist=pelist) + call mpp_sum(c8,n-1) + if (pe .EQ. root) then + suma8 = real(fullsum, kind=r8_kind) + sumb8 = real(pesum, kind=r8_kind) + sumc8 = suma8 + sumc8(2,2,2,2) = 1 + if (all(a8 .ne. suma8)) call mpp_error(FATAL, "4D_r8: mpp_sum differs from fortran intrinsic sum") + if (all(b8 .ne. sumb8)) call mpp_error(FATAL, "4D_r8 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c8 .ne. sumc8)) call mpp_error(FATAL, "4D_r8 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_4D_r8 + + !> Test the functionality of mpp_transmit for an i4_4D. + subroutine test_mpp_sum_4D_i4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=16 + integer(kind=i4_kind), dimension(2,2,2,2) :: a4, suma4, b4, sumb4, c4, sumc4 + + a4 = int(pe+1, kind=i4_kind) + b4 = a4 + c4 = a4 + call mpp_sync() + call mpp_sum(a4,n) + if (pe .LE. npes-2) call mpp_sum(b4, n, pelist=pelist) + call mpp_sum(c4,n-1) + if (pe .EQ. root) then + suma4 = int(fullsum, kind=i4_kind) + sumb4 = int(pesum, kind=i4_kind) + sumc4 = suma4 + sumc4(2,2,2,2) = 1 + if (all(a4 .ne. suma4)) call mpp_error(FATAL, "4D_i4: mpp_sum differs from fortran intrinsic sum") + if (all(b4 .ne. sumb4)) call mpp_error(FATAL, "4D_i4 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c4 .ne. 
sumc4)) call mpp_error(FATAL, "4D_i4 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_4D_i4 + + !> Test the functionality of mpp_transmit for an i8_4D. + subroutine test_mpp_sum_4D_i8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=16 + integer(kind=i8_kind), dimension(2,2,2,2) :: a8, suma8, b8, sumb8, c8, sumc8 + + a8 = int(pe+1, kind=i8_kind) + b8 = a8 + c8 = a8 + call mpp_sync() + call mpp_sum(a8,n) + if (pe .LE. npes-2) call mpp_sum(b8, n, pelist=pelist) + call mpp_sum(c8,n-1) + if (pe .EQ. root) then + suma8 = int(fullsum, kind=i8_kind) + sumb8 = int(pesum, kind=i8_kind) + sumc8 = suma8 + sumc8(2,2,2,2) = 1 + if (all(a8 .ne. suma8)) call mpp_error(FATAL, "4D_i8: mpp_sum differs from fortran intrinsic sum") + if (all(b8 .ne. sumb8)) call mpp_error(FATAL, "4D_i8 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c8 .ne. sumc8)) call mpp_error(FATAL, "4D_i8 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_4D_i8 + + subroutine test_mpp_sum_5D(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + + call test_mpp_sum_5D_r4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_5D_r8(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_5D_i4(pe,npes,root,pelist,fullsum,pesum) + call test_mpp_sum_5D_i8(pe,npes,root,pelist,fullsum,pesum) + + end subroutine test_mpp_sum_5D + + !> Test the functionality of mpp_transmit for an r4_5D. 
+ subroutine test_mpp_sum_5D_r4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=32 + real(kind=r4_kind), dimension(2,2,2,2,2) :: a4, suma4, b4, sumb4, c4, sumc4 + + a4 = real(pe+1, kind=r4_kind) + b4 = a4 + c4 = a4 + call mpp_sync() + call mpp_sum(a4,n) + if (pe .LE. npes-2) call mpp_sum(b4, n, pelist=pelist) + call mpp_sum(c4,n-1) + if (pe .EQ. root) then + suma4 = real(fullsum, kind=r4_kind) + sumb4 = real(pesum, kind=r4_kind) + sumc4 = suma4 + sumc4(2,2,2,2,2) = 1 + if (all(a4 .ne. suma4)) call mpp_error(FATAL, "5D_r4: mpp_sum differs from fortran intrinsic sum") + if (all(b4 .ne. sumb4)) call mpp_error(FATAL, "5D_r4 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c4 .ne. sumc4)) call mpp_error(FATAL, "5D_r4 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_5D_r4 + + !> Test the functionality of mpp_transmit for an r8_5D. + subroutine test_mpp_sum_5D_r8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=32 + real(kind=r8_kind), dimension(2,2,2,2,2) :: a8, suma8, b8, sumb8, c8, sumc8 + + a8 = real(pe+1, kind=r8_kind) + b8 = a8 + c8 = a8 + call mpp_sync() + call mpp_sum(a8,n) + if (pe .LE. npes-2) call mpp_sum(b8, n, pelist=pelist) + call mpp_sum(c8,n-1) + if (pe .EQ. root) then + suma8 = real(fullsum, kind=r8_kind) + sumb8 = real(pesum, kind=r8_kind) + sumc8 = suma8 + sumc8(2,2,2,2,2) = 1 + if (all(a8 .ne. suma8)) call mpp_error(FATAL, "5D_r8: mpp_sum differs from fortran intrinsic sum") + if (all(b8 .ne. sumb8)) call mpp_error(FATAL, "5D_r8 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c8 .ne. 
sumc8)) call mpp_error(FATAL, "5D_r8 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_5D_r8 + + !> Test the functionality of mpp_transmit for an i4_5D. + subroutine test_mpp_sum_5D_i4(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=32 + integer(kind=i4_kind), dimension(2,2,2,2,2) :: a4, suma4, b4, sumb4, c4, sumc4 + + a4 = int(pe+1, kind=i4_kind) + b4 = a4 + c4 = a4 + call mpp_sync() + call mpp_sum(a4,n) + if (pe .LE. npes-2) call mpp_sum(b4, n, pelist=pelist) + call mpp_sum(c4,n-1) + if (pe .EQ. root) then + suma4 = int(fullsum, kind=i4_kind) + sumb4 = int(pesum, kind=i4_kind) + sumc4 = suma4 + sumc4(2,2,2,2,2) = 1 + if (all(a4 .ne. suma4)) call mpp_error(FATAL, "5D_i4: mpp_sum differs from fortran intrinsic sum") + if (all(b4 .ne. sumb4)) call mpp_error(FATAL, "5D_i4 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c4 .ne. sumc4)) call mpp_error(FATAL, "5D_i4 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_5D_i4 + + !> Test the functionality of mpp_transmit for an i8_5D. + subroutine test_mpp_sum_5D_i8(pe,npes,root,pelist,fullsum,pesum) + integer, intent(in) :: npes, pe, root, fullsum, pesum + integer, intent(in), dimension(0:npes-2) :: pelist + integer, parameter :: n=32 + integer(kind=i8_kind), dimension(2,2,2,2,2) :: a8, suma8, b8, sumb8, c8, sumc8 + + a8 = int(pe+1, kind=i8_kind) + b8 = a8 + c8 = a8 + call mpp_sync() + call mpp_sum(a8,n) + if (pe .LE. npes-2) call mpp_sum(b8, n, pelist=pelist) + call mpp_sum(c8,n-1) + if (pe .EQ. root) then + suma8 = int(fullsum, kind=i8_kind) + sumb8 = int(pesum, kind=i8_kind) + sumc8 = suma8 + sumc8(2,2,2,2,2) = 1 + if (all(a8 .ne. suma8)) call mpp_error(FATAL, "5D_i8: mpp_sum differs from fortran intrinsic sum") + if (all(b8 .ne. 
sumb8)) call mpp_error(FATAL, "5D_i8 with pelist: mpp_sum differs from fortran intrinsic sum") + if (all(c8 .ne. sumc8)) call mpp_error(FATAL, "5D_i8 (shorter length): mpp_sum differs from fortran intrinsic sum") + endif + + end subroutine test_mpp_sum_5D_i8 + +end program test_mpp_sum diff --git a/test_fms/mpp/test_mpp_sum.sh b/test_fms/mpp/test_mpp_sum.sh new file mode 100755 index 0000000000..e4a3a8b1fb --- /dev/null +++ b/test_fms/mpp/test_mpp_sum.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Lauren Chilutti 08/18/2020 + +# Set common test settings. +. ../test_common.sh + +# Run the test for 5 processors +run_test test_mpp_sum 5 diff --git a/test_fms/mpp/test_mpp_transmit.F90 b/test_fms/mpp/test_mpp_transmit.F90 new file mode 100644 index 0000000000..69e0e1f855 --- /dev/null +++ b/test_fms/mpp/test_mpp_transmit.F90 @@ -0,0 +1,582 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). 
+!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @file +!! @author Lauren Chilutti +!! @brief Test program for the mpp_transmit interface. +!! @email gfdl.climate.model.info@noaa.gov +!! @description This test program is for testing the mpp_transmit interface. + +program test_mpp_transmit + + use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe + use mpp_mod, only : mpp_sync, mpp_sync_self + use mpp_mod, only : mpp_set_stack_size, mpp_init_test_requests_allocated + use mpp_mod, only : mpp_transmit, ALL_PES, NULL_PE + use mpp_mod, only : mpp_error, FATAL + use platform_mod + + implicit none + + integer :: ierr + integer :: pe, npes, root + + call mpp_init(test_level=mpp_init_test_requests_allocated) + call mpp_set_stack_size(3145746) + pe = mpp_pe() + npes = mpp_npes() + root = mpp_root_pe() + + if( pe.EQ.root ) print *, '------------------> Calling test_mpp_transmit <------------------' + call test_mpp_transmit_null_pe(npes,pe,root) + call test_mpp_transmit_all_pes(npes,pe,root) + call test_mpp_transmit_scalar(npes,pe,root) + call test_mpp_transmit_2D(npes,pe,root) + call test_mpp_transmit_3D(npes,pe,root) + call test_mpp_transmit_4D(npes,pe,root) + if( pe.EQ.root ) print *, '------------------> Finished test_mpp_transmit <------------------' + + call MPI_FINALIZE(ierr) + +contains + + !> Test the use of NULL_PE as an 
argument. Only testing once for an r4_scalar. + subroutine test_mpp_transmit_null_pe(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=1 + real(kind=r4_kind) :: a4 + + ! Initializing a4 as a unique number for each pe + a4 = real(pe, kind=r4_kind) + call mpp_sync() + + if (pe .EQ. 0) then + do i = 1,npes-1 + call mpp_transmit( put_data=a4, plen=n, to_pe=i, get_data=a4, glen=n, from_pe=NULL_PE ) + call mpp_sync_self() + end do + else + call mpp_transmit( put_data=a4, plen=n, to_pe=NULL_PE, get_data=a4, glen=n, from_pe=0 ) + call mpp_sync_self() + end if + + ! a4 should equal 0 for all pes + if (a4 .NE. 0) call mpp_error(FATAL, "Test_mpp_transmit_null_pe: transmit didn't go as expected") + end subroutine test_mpp_transmit_null_pe + + !> Test the use of ALL_PES as an argument. Only testing once for an r4_scalar. + subroutine test_mpp_transmit_all_pes(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer, parameter :: n=1 + real(kind=r4_kind) :: a4 + + ! Initializing a4 as a unique number for each pe + a4 = real(pe, kind=r4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4, plen=n, to_pe=ALL_PES, & + get_data=a4, glen=n, from_pe=root ) + call mpp_sync_self() + ! a4 should equal 0 for all pes + if (a4 .NE. 0) call mpp_error(FATAL, "Test_mpp_transmit_all_pes: transmit didn't go as expected") + end subroutine test_mpp_transmit_all_pes + + subroutine test_mpp_transmit_scalar(npes,pe,root) + integer, intent(in) :: npes, pe, root + + call test_mpp_transmit_r4_scalar(npes,pe,root) + call test_mpp_transmit_r8_scalar(npes,pe,root) + call test_mpp_transmit_i4_scalar(npes,pe,root) + call test_mpp_transmit_i8_scalar(npes,pe,root) + + end subroutine test_mpp_transmit_scalar + + !> Test the functionality of mpp_transmit for an r4_scalar. + subroutine test_mpp_transmit_r4_scalar(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer, parameter :: n=1 + real(kind=r4_kind) :: a4, b4, c4 + + ! 
Initializing a4 as a unique number for each pe + a4 = real(pe, kind=r4_kind) + b4 = real(0, kind=r4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4, plen=n, to_pe=modulo(pe+1, npes), & + get_data=b4, glen=n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + c4 = real(modulo(int(a4)+npes-1,npes), kind=r4_kind) + ! b4 should now equal the value of a4 from the "from_pe" + if (b4 .NE. c4 ) call mpp_error(FATAL, "Test_mpp_transmit_r4_scalar: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r4_scalar + + !> Test the functionality of mpp_transmit for an r8_scalar. + subroutine test_mpp_transmit_r8_scalar(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer, parameter :: n=1 + real(kind=r8_kind) :: a8, b8, c8 + + ! Initializing a8 as a unique number for each pe + a8 = real(pe, kind=r8_kind) + b8 = real(0, kind=r8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8, plen=n, to_pe=modulo(pe+1, npes), & + get_data=b8, glen=n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + c8 = real(modulo(int(a8)+npes-1,npes), kind=r8_kind) + ! b8 should now equal the value of a8 from the "from_pe" + if (b8 .NE. c8 ) call mpp_error(FATAL, "Test_mpp_transmit_r8_scalar: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r8_scalar + + !> Test the functionality of mpp_transmit for an i4_scalar. + subroutine test_mpp_transmit_i4_scalar(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer, parameter :: n=1 + integer(kind=i4_kind) :: a4, b4, c4 + + ! Initializing a4 as a unique number for each pe + a4 = int(pe, kind=i4_kind) + b4 = int(0, kind=i4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4, plen=n, to_pe=modulo(pe+1, npes), & + get_data=b4, glen=n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + c4 = int(modulo(a4+npes-1,npes), kind=i4_kind) + ! b4 should now equal the value of a4 from the "from_pe" + if (b4 .NE. 
c4 ) call mpp_error(FATAL, "Test_mpp_transmit_i4_scalar: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i4_scalar + + !> Test the functionality of mpp_transmit for an i8_scalar. + subroutine test_mpp_transmit_i8_scalar(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer, parameter :: n=1 + integer(kind=i8_kind) :: a8, b8, c8 + + ! Initializing a8 as a unique number for each pe + a8 = int(pe, kind=i8_kind) + b8 = int(0, kind=i8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8, plen=n, to_pe=modulo(pe+1, npes), & + get_data=b8, glen=n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + c8 = int(modulo(a8+npes-1,npes), kind=i8_kind) + ! b8 should now equal the value of a8 from the "from_pe" + if (b8 .NE. c8 ) call mpp_error(FATAL, "Test_mpp_transmit_i8_scalar: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i8_scalar + + subroutine test_mpp_transmit_2D(npes,pe,root) + integer, intent(in) :: npes, pe, root + + call test_mpp_transmit_r4_2D(npes,pe,root) + call test_mpp_transmit_r8_2D(npes,pe,root) + call test_mpp_transmit_i4_2D(npes,pe,root) + call test_mpp_transmit_i8_2D(npes,pe,root) + + end subroutine test_mpp_transmit_2D + + !> Test the functionality of mpp_transmit for an r4_2D. + subroutine test_mpp_transmit_r4_2D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=2 + real(kind=r4_kind), dimension(2,2) :: a4, b4, c4 + + ! Initilizing the a4 array with unique numbers for each element and pe + a4 = real( reshape((/(i, i=pe, pe+(n**2-1))/), shape(a4)), kind=r4_kind) + b4 = real(0, kind=r4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4(1,1), plen=n**2, to_pe=modulo(pe+1, npes), & + get_data=b4(1,1), glen=n**2, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c4 = reshape((/(i, i=modulo(npes+pe-1,npes),modulo(npes+pe-1,npes)+(2**n-1))/),shape(c4)) + ! 
b4(1,1) should now equal the value of a4(1,1)from the "from_pe" + if (all(b4 .NE. c4) ) call mpp_error(FATAL, "Test_mpp_transmit_r4_2D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r4_2D + + !> Test the functionality of mpp_transmit for an r8_2D. + subroutine test_mpp_transmit_r8_2D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=2 + real(kind=r8_kind), dimension(2,2) :: a8, b8, c8 + + ! Initilizing the a8 array with unique numbers for each element and pe + a8 = real( reshape((/(i, i=pe, pe+(n**2-1))/), shape(a8)), kind=r8_kind) + b8 = real(0, kind=r8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8(1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b8(1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c8 = reshape((/(i, i=modulo(npes+pe-1,npes),modulo(npes+pe-1,npes)+(2**n-1))/),shape(c8)) + ! b8 should now equal the value of a8 from the "from_pe" + if (all(b8 .NE. c8) ) call mpp_error(FATAL, "Test_mpp_transmit_r8_2D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r8_2D + + !> Test the functionality of mpp_transmit for an i4_2D. + subroutine test_mpp_transmit_i4_2D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=2 + integer(kind=i4_kind), dimension(2,2) :: a4, b4, c4 + + ! Initilizing the a4 array with unique numbers for each element and pe + a4 = int( reshape((/(i, i=pe, pe+(n**2-1))/), shape(a4)), kind=i4_kind) + b4 = int(0, kind=i4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4(1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b4(1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c4 = reshape((/(i, i=modulo(npes+pe-1,npes),modulo(npes+pe-1,npes)+(2**n-1))/),shape(c4)) + ! b4 should now equal the value of a4 from the "from_pe" + if (all(b4 .NE. 
c4) ) call mpp_error(FATAL, "Test_mpp_transmit_i4_2D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i4_2D + + !> Test the functionality of mpp_transmit for an i8_2D. + subroutine test_mpp_transmit_i8_2D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=2 + integer(kind=i8_kind), dimension(2,2) :: a8, b8, c8 + + ! Initilizing the a8 array with unique numbers for each element and pe + a8 = int( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a8)), kind=i8_kind) + b8 = int(0, kind=i8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8(1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b8(1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c8 = reshape((/(i, i=modulo(npes+pe-1,npes),modulo(npes+pe-1,npes)+(2**n-1))/),shape(c8)) + ! b8 should now equal the value of a8 from the "from_pe" + if ( all(b8 .NE. c8) ) call mpp_error(FATAL, "Test_mpp_transmit_i8_2D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i8_2D + + subroutine test_mpp_transmit_3D(npes,pe,root) + integer, intent(in) :: npes, pe, root + + call test_mpp_transmit_r4_3D(npes,pe,root) + call test_mpp_transmit_r8_3D(npes,pe,root) + call test_mpp_transmit_i4_3D(npes,pe,root) + call test_mpp_transmit_i8_3D(npes,pe,root) + + end subroutine test_mpp_transmit_3D + + !> Test the functionality of mpp_transmit for an r4_3D. + subroutine test_mpp_transmit_r4_3D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=3 + real(kind=r4_kind), dimension(2,2,2) :: a4, b4, c4 + + ! 
Initilizing the a4 array with unique numbers for each element and pe + a4 = real( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a4)), kind=r4_kind) + b4 = real(0, kind=r4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4(1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b4(1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c4 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c4)) + ! b4 should now equal the value of a4 from the "from_pe" + if ( all(b4 .NE. c4) ) call mpp_error(FATAL, "Test_mpp_transmit_r4_3D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r4_3D + + !> Test the functionality of mpp_transmit for an r8_3D. + subroutine test_mpp_transmit_r8_3D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=3 + real(kind=r8_kind), dimension(2,2,2) :: a8, b8, c8 + + ! Initilizing the a8 array with unique numbers for each element and pe + a8 = real( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a8)), kind=r8_kind) + b8 = real(0, kind=r8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8(1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b8(1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c8 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c8)) + ! b8 should now equal the value of a8 from the "from_pe" + if ( all(b8 .NE. c8) ) call mpp_error(FATAL, "Test_mpp_transmit_r8_3D: transmit didn't go as expected") + + + end subroutine test_mpp_transmit_r8_3D + + !> Test the functionality of mpp_transmit for an i4_3D. + subroutine test_mpp_transmit_i4_3D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=3 + integer(kind=i4_kind), dimension(2,2,2) :: a4, b4, c4 + + ! 
Initilizing the a4 array with unique numbers for each element and pe + a4 = int( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a4)), kind=i4_kind) + b4 = int(0, kind=i4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4(1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b4(1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c4 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c4)) + ! b4 should now equal the value of a4 from the "from_pe" + if ( all(b4 .NE. c4) ) call mpp_error(FATAL, "Test_mpp_transmit_i4_3D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i4_3D + + !> Test the functionality of mpp_transmit for an i8_3D. + subroutine test_mpp_transmit_i8_3D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=3 + integer(kind=i8_kind), dimension(2,2,2) :: a8, b8, c8 + + ! Initilizing the a8 array with unique numbers for each element and pe + a8 = int( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a8)), kind=i8_kind) + b8 = int(0, kind=i8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8(1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b8(1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c8 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c8)) + ! b8 should now equal the value of a8(1,1,1)from the "from_pe" + if ( all(b8 .NE. c8) ) call mpp_error(FATAL, "Test_mpp_transmit_i8_3D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i8_3D + + subroutine test_mpp_transmit_4D(npes,pe,root) + integer, intent(in) :: npes, pe, root + + call test_mpp_transmit_r4_4D(npes,pe,root) + call test_mpp_transmit_r8_4D(npes,pe,root) + call test_mpp_transmit_i4_4D(npes,pe,root) + call test_mpp_transmit_i8_4D(npes,pe,root) + + end subroutine test_mpp_transmit_4D + + !> Test the functionality of mpp_transmit for an r4_4D. 
+ subroutine test_mpp_transmit_r4_4D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=4 + real(kind=r4_kind), dimension(2,2,2,2) :: a4, b4, c4 + + ! Initilizing the a4 array with unique numbers for each element and pe + a4 = real( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a4)), kind=r4_kind) + b4 = real(0, kind=r4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4(1,1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b4(1,1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c4 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c4)) + ! b4 should now equal the value of a4 from the "from_pe" + if ( all(b4 .NE. c4) ) call mpp_error(FATAL, "Test_mpp_transmit_r4_4D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r4_4D + + !> Test the functionality of mpp_transmit for an r8_4D. + subroutine test_mpp_transmit_r8_4D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=4 + real(kind=r8_kind), dimension(2,2,2,2) :: a8, b8, c8 + + ! Initilizing the a4 array with unique numbers for each element and pe + a8 = real( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a8)), kind=r8_kind) + b8 = real(0, kind=r8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8(1,1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b8(1,1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c8 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c8)) + ! b8 should now equal the value of a8 from the "from_pe" + if ( all(b8 .NE. c8) ) call mpp_error(FATAL, "Test_mpp_transmit_r8_4D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r8_4D + + !> Test the functionality of mpp_transmit for an i4_4D. 
+ subroutine test_mpp_transmit_i4_4D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=4 + integer(kind=i4_kind), dimension(2,2,2,2) :: a4, b4, c4 + + ! Initilizing the a4 array with unique numbers for each element and pe + a4 = int( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a4)), kind=i4_kind) + b4 = int(0, kind=i4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4(1,1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b4(1,1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c4 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c4)) + ! b4 should now equal the value of a4 from the "from_pe" + if ( all(b4 .NE. c4) ) call mpp_error(FATAL, "Test_mpp_transmit_i4_4D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i4_4D + + !> Test the functionality of mpp_transmit for an i8_4D. + subroutine test_mpp_transmit_i8_4D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=4 + integer(kind=i8_kind), dimension(2,2,2,2) :: a8, b8, c8 + + ! Initilizing the a8 array with unique numbers for each element and pe + a8 = int( reshape((/(i, i=pe, pe+(n**2-1))/), shape(a8)), kind=i8_kind) + b8 = int(0, kind=i8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8(1,1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b8(1,1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c8 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c8)) + ! b8 should now equal the value of a8 from the "from_pe" + if ( all(b8 .NE. 
c8) ) call mpp_error(FATAL, "Test_mpp_transmit_i8_4D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i8_4D + + subroutine test_mpp_transmit_5D(npes,pe,root) + integer, intent(in) :: npes, pe, root + + call test_mpp_transmit_r4_5D(npes,pe,root) + call test_mpp_transmit_r8_5D(npes,pe,root) + call test_mpp_transmit_i4_5D(npes,pe,root) + call test_mpp_transmit_i8_5D(npes,pe,root) + + end subroutine test_mpp_transmit_5D + + !> Test the functionality of mpp_transmit for an r4_5D. + subroutine test_mpp_transmit_r4_5D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=5 + real(kind=r4_kind), dimension(2,2,2,2,2) :: a4, b4, c4 + + ! Initilizing the a4 array with unique numbers for each element and pe + a4 = real( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a4)), kind=r4_kind) + b4 = real(0, kind=r4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4(1,1,1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b4(1,1,1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c4 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c4)) + ! b4 should now equal the value of a4 from the "from_pe" + if ( all(b4 .NE. c4) ) call mpp_error(FATAL, "Test_mpp_transmit_r4_5D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r4_5D + + !> Test the functionality of mpp_transmit for an r8_5D. + subroutine test_mpp_transmit_r8_5D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=5 + real(kind=r8_kind), dimension(2,2,2,2,2) :: a8, b8, c8 + + ! 
Initializing the a8 array with unique numbers for each element and pe + a8 = real( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a8)), kind=r8_kind) + b8 = real(0, kind=r8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8(1,1,1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b8(1,1,1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c8 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c8)) + ! b8 should now equal the value of a8 from the "from_pe" + if ( all(b8 .NE. c8) ) call mpp_error(FATAL, "Test_mpp_transmit_r8_5D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_r8_5D + + !> Test the functionality of mpp_transmit for an i4_5D. + subroutine test_mpp_transmit_i4_5D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=5 + integer(kind=i4_kind), dimension(2,2,2,2,2) :: a4, b4, c4 + + ! Initializing the a4 array with unique numbers for each element and pe + a4 = int( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a4)), kind=i4_kind) + b4 = int(0, kind=i4_kind) + call mpp_sync() + + call mpp_transmit( put_data=a4(1,1,1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b4(1,1,1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c4 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c4)) + ! b4 should now equal the value of a4 from the "from_pe" + if ( all(b4 .NE. c4) ) call mpp_error(FATAL, "Test_mpp_transmit_i4_5D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i4_5D + + !> Test the functionality of mpp_transmit for an i8_5D. + subroutine test_mpp_transmit_i8_5D(npes,pe,root) + integer, intent(in) :: npes, pe, root + integer :: i + integer, parameter :: n=5 + integer(kind=i8_kind), dimension(2,2,2,2,2) :: a8, b8, c8 + + ! 
Initializing the a8 array with unique numbers for each element and pe + a8 = int( reshape((/(i, i=pe, pe+(2**n-1))/), shape(a8)), kind=i8_kind) + b8 = int(0, kind=i8_kind) + call mpp_sync() + + call mpp_transmit( put_data=a8(1,1,1,1,1), plen=2**n, to_pe=modulo(pe+1, npes), & + get_data=b8(1,1,1,1,1), glen=2**n, from_pe=modulo(npes+pe-1, npes) ) + call mpp_sync_self() + + c8 = reshape((/(i, i=modulo(npes+pe-1,npes), modulo(npes+pe-1,npes)+(2**n-1))/), shape(c8)) + ! b8 should now equal the value of a8 from the "from_pe" + if ( all(b8 .NE. c8) ) call mpp_error(FATAL, "Test_mpp_transmit_i8_5D: transmit didn't go as expected") + + end subroutine test_mpp_transmit_i8_5D +end program test_mpp_transmit diff --git a/test_fms/mpp/test_mpp_transmit.sh b/test_fms/mpp/test_mpp_transmit.sh new file mode 100755 index 0000000000..fdf003a109 --- /dev/null +++ b/test_fms/mpp/test_mpp_transmit.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Lauren Chilutti 09/09/2020 + +# Set common test settings. +. 
../test_common.sh + +# Run the test for 5 processors +run_test test_mpp_transmit 6 diff --git a/test_fms/mpp/test_mpp_update_domains b/test_fms/mpp/test_mpp_update_domains new file mode 100755 index 0000000000..a9102df74b --- /dev/null +++ b/test_fms/mpp/test_mpp_update_domains @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_mpp_update_domains - temporary wrapper script for .libs/test_mpp_update_domains +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_mpp_update_domains program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. 
+func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_mpp_update_domains:test_mpp_update_domains:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_mpp_update_domains:test_mpp_update_domains:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_mpp_update_domains:test_mpp_update_domains:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. 
+ WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_mpp_update_domains' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_mpp_update_domains.sh b/test_fms/mpp/test_mpp_update_domains.sh new file mode 100755 index 0000000000..535af6491c --- /dev/null +++ b/test_fms/mpp/test_mpp_update_domains.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). 
+# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Jessica Liptak + +# Set common test settings. +. ../test_common.sh +# Run the test for one processor +echo "Running test_mpp_update_domains with 1 pe" +run_test test_mpp_update_domains 1 +# If on a Linux system that uses the command `nproc`, run the test +if [ $(command -v nproc) ] + # Looks like a linux system + then + # Get the number of available CPUs on the system + nProc=$(nproc) + if [ ${nProc} -ge 2 ] + then + # Run the test with 2 pes + echo "Running test_mpp_update_domains with 2 pes" + run_test test_mpp_update_domains 2 + fi +fi diff --git a/test_fms/mpp/test_mpp_update_domains_ad.F90 b/test_fms/mpp/test_mpp_update_domains_ad.F90 new file mode 100644 index 0000000000..07739e8e65 --- /dev/null +++ b/test_fms/mpp/test_mpp_update_domains_ad.F90 @@ -0,0 +1,391 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). 
+!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @author Jessica Liptak +!> @brief Test mpp_update_domains_ad using different layouts and data precision +program test_mpp_update_domains_ad + use mpp_mod, only : FATAL, WARNING, NOTE + use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe, mpp_error + use mpp_mod, only : mpp_set_stack_size + use mpp_mod, only : mpp_transmit, mpp_sum, mpp_sync + use mpp_mod, only : mpp_init_test_requests_allocated + use mpp_domains_mod, only : GLOBAL_DATA_DOMAIN + use mpp_domains_mod, only : CGRID_NE, MPP_DOMAIN_TIME + use mpp_domains_mod, only : domain2D + use mpp_domains_mod, only : mpp_get_compute_domain, mpp_get_data_domain, mpp_domains_set_stack_size + use mpp_domains_mod, only : mpp_global_field, mpp_global_sum + use mpp_domains_mod, only : mpp_domains_init, mpp_domains_exit + use mpp_domains_mod, only : mpp_update_domains, mpp_update_domains_ad, mpp_check_field + use mpp_domains_mod, only : mpp_define_layout, mpp_define_domains, mpp_modify_domain + use mpp_domains_mod, only : mpp_get_global_domain + use mpp_io_mod, only : mpp_io_init + use platform_mod, only : r4_kind, r8_kind + + implicit none + + integer :: ierr, id + integer :: pe, npes + integer :: nx=64, ny=64, nz=10, stackmax=10000000 + integer :: i, j, k, n + integer :: layout(2) + integer :: mpes = 0 + integer :: whalo = 2, ehalo 
= 2, shalo = 2, nhalo = 2 + !> Initialize mpp and mpp IO modules + call mpp_init(test_level=mpp_init_test_requests_allocated) + call mpp_domains_init(MPP_DOMAIN_TIME) + call mpp_io_init() + call mpp_domains_set_stack_size(stackmax) + pe = mpp_pe() + npes = mpp_npes() + !> run the tests + if (pe == mpp_root_pe()) & + print *, '--------------------> Calling test_halo_update_ad_r8(Simple) <-------------------' + call test_halo_update_ad_r8('Simple') + + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Calling test_halo_update_ad_r4(Simple) <-------------------' + call test_halo_update_ad_r4('Simple') + + call mpp_domains_exit() + !> Finalize mpp + call MPI_FINALIZE(ierr) +contains + !> test calling mpp_halo_update_ad on a 3D 64-bit real data array + subroutine test_halo_update_ad_r8( test_type ) + character(len=*), intent(in) :: test_type + ! local + type(domain2D) :: domain + integer :: shift, i, j, k + logical :: is_symmetry + integer :: is, ie, js, je, isd, ied, jsd, jed, pe + real(kind=r8_kind), allocatable, dimension(:,:,:) :: x_ad, y_ad, x_fd, y_fd, x_save, y_save + real(kind=r8_kind) :: ad_sum, fd_sum, sum_diff + + if(index(test_type, 'symmetry') == 0) then + is_symmetry = .false. + else + is_symmetry = .true. 
+ end if + select case(test_type) + case( 'Simple', 'Simple symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=test_type, symmetry = is_symmetry ) + case default + call mpp_error( FATAL, 'test_mpp_update_domains_ad_r8: '//test_type//' is not a valid test.') + end select + +!set up x array + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + + shift=1 +!---test 3d single fields---------------------------------------------------------- + allocate( x_fd(isd:ied,jsd:jed,nz) ) + allocate( x_ad(isd:ied,jsd:jed,nz) ) + allocate( x_save(isd:ied,jsd:jed,nz) ) + x_fd = 0.0; x_ad = 0.0; x_save = 0.0 + + do k = 1,nz + do j = js,je + do i = is,ie + x_fd(i,j,k) = i*j + end do + end do + end do + x_save = x_fd + + ! full update + call mpp_update_domains( x_fd, domain ) + + fd_sum = 0.0 + do k = 1,nz + do j = jsd,jed + do i = isd,ied + fd_sum = fd_sum + x_fd(i,j,k)*x_fd(i,j,k) + end do + end do + end do + call mpp_sum( fd_sum ) + + x_ad = x_fd + call mpp_update_domains_ad( x_ad, domain ) + + ad_sum = 0.0 + do k = 1,nz + do j = jsd,jed + do i = isd,ied + ad_sum = ad_sum + x_ad(i,j,k)*x_save(i,j,k) + end do + end do + end do + call mpp_sum( ad_sum ) + call mpp_sync() + pe = mpp_pe() + sum_diff = 0.0 + sum_diff = abs(ad_sum-fd_sum)/fd_sum + + if( pe.EQ.mpp_root_pe() ) then + if (sum_diff .lt. 1E-7) then + call MPP_ERROR(NOTE, "Passed Adjoint Dot Test: mpp_update_domains_ad_r8(single 3D field)") + else + call MPP_ERROR(FATAL, "FAILED Adjoint Dot Test: mpp_update_domains_ad_r8(single 3D field)") + endif + endif + + deallocate (x_ad, x_fd, x_save) + + ! 
test 3d vector fields + allocate( x_ad (isd:ied+shift,jsd:jed ,nz) ) + allocate( x_fd (isd:ied+shift,jsd:jed ,nz) ) + allocate( x_save(isd:ied+shift,jsd:jed ,nz) ) + allocate( y_ad (isd:ied ,jsd:jed+shift,nz) ) + allocate( y_fd (isd:ied ,jsd:jed+shift,nz) ) + allocate( y_save(isd:ied ,jsd:jed+shift,nz) ) + + x_fd=0; y_fd=0 + do k = 1,nz + do j = js,je + do i = is,ie + x_fd(i,j,k)=i*j + y_fd(i,j,k)=i*j + end do + end do + end do + + call mpp_update_domains( x_fd, y_fd, domain, gridtype=CGRID_NE) + x_save=x_fd + y_save=y_fd + + fd_sum = 0. + do k = 1,nz + do j = jsd,jed + do i = isd,ied+shift + fd_sum = fd_sum + x_fd(i,j,k)*x_fd(i,j,k) + end do + end do + end do + do k = 1,nz + do j = jsd,jed+shift + do i = isd,ied + fd_sum = fd_sum + y_fd(i,j,k)*y_fd(i,j,k) + end do + end do + end do + call mpp_sum( fd_sum ) + + x_ad = x_fd + y_ad = y_fd + call mpp_update_domains_ad( x_ad, y_ad, domain, gridtype=CGRID_NE) + + ad_sum = 0.0 + do k = 1,nz + do j = jsd,jed + do i = isd,ied+shift + ad_sum = ad_sum + x_ad(i,j,k)*x_save(i,j,k) + end do + end do + end do + do k = 1,nz + do j = jsd,jed+shift + do i = isd,ied + ad_sum = ad_sum + y_ad(i,j,k)*y_save(i,j,k) + end do + end do + end do + call mpp_sum( ad_sum ) + call mpp_sync() + pe = mpp_pe() + sum_diff = 0.0 + sum_diff = abs(ad_sum-fd_sum)/fd_sum + + if ( pe.EQ.mpp_root_pe() ) then + if (sum_diff .lt. 1E-7) then + call MPP_ERROR(NOTE, "Passed Adjoint Dot Test: mpp_update_domains_ad_r8(vector 3D fields)") + else + call MPP_ERROR(FATAL,"FAILED Adjoint Dot Test: mpp_update_domains_ad_r8(vector 3D fields)") + endif + endif + deallocate (x_ad, y_ad, x_fd, y_fd, x_save, y_save) + + end subroutine test_halo_update_ad_r8 + + !> test calling mpp_halo_update_ad on a 3D 32-bit real data array + subroutine test_halo_update_ad_r4( test_type ) + character(len=*), intent(in) :: test_type + ! 
local + type(domain2D) :: domain + integer :: shift, i, j, k + logical :: is_symmetry + integer :: is, ie, js, je, isd, ied, jsd, jed, pe + real(kind=r4_kind), allocatable, dimension(:,:,:) :: x_ad, y_ad, x_fd, y_fd, x_save, y_save + real(kind=r4_kind) :: ad_sum, fd_sum, sum_diff + + if(index(test_type, 'symmetry') == 0) then + is_symmetry = .false. + else + is_symmetry = .true. + end if + select case(test_type) + case( 'Simple', 'Simple symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=test_type, symmetry = is_symmetry ) + case default + call mpp_error( FATAL, 'test_mpp_update_domains_ad_r4: '//test_type//' is not a valid test.') + end select + + ! set up the x array + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + + shift=1 +!---test 3d single fields---------------------------------------------------------- + allocate( x_fd(isd:ied,jsd:jed,nz) ) + allocate( x_ad(isd:ied,jsd:jed,nz) ) + allocate( x_save(isd:ied,jsd:jed,nz) ) + x_fd = 0.0; x_ad = 0.0; x_save = 0.0 + + do k = 1,nz + do j = js,je + do i = is,ie + x_fd(i,j,k) = i*j + end do + end do + end do + x_save = x_fd + + ! full update + call mpp_update_domains( x_fd, domain ) + + fd_sum = 0.0 + do k = 1,nz + do j = jsd,jed + do i = isd,ied + fd_sum = fd_sum + x_fd(i,j,k)*x_fd(i,j,k) + end do + end do + end do + call mpp_sum( fd_sum ) + + x_ad = x_fd + call mpp_update_domains_ad( x_ad, domain ) + + ad_sum = 0.0 + do k = 1,nz + do j = jsd,jed + do i = isd,ied + ad_sum = ad_sum + x_ad(i,j,k)*x_save(i,j,k) + end do + end do + end do + call mpp_sum( ad_sum ) + call mpp_sync() + pe = mpp_pe() + sum_diff = 0.0 + sum_diff = abs(ad_sum-fd_sum)/fd_sum + + if( pe.EQ.mpp_root_pe() ) then + !> @note: threshold differs from R8 test threshold because of expected + !! 
increase in roundoff with reduced precision + if (sum_diff .lt. 1E-6) then + call MPP_ERROR(NOTE, "Passed Adjoint Dot Test: mpp_update_domains_ad_r4(single 3D field)") + else + call MPP_ERROR(NOTE, "FAILED Adjoint Dot Test: mpp_update_domains_ad_r4(single 3D field):") + endif + endif + + deallocate (x_ad, x_fd, x_save) + + ! test 3d vector fields + allocate( x_ad (isd:ied+shift,jsd:jed ,nz) ) + allocate( x_fd (isd:ied+shift,jsd:jed ,nz) ) + allocate( x_save(isd:ied+shift,jsd:jed ,nz) ) + allocate( y_ad (isd:ied ,jsd:jed+shift,nz) ) + allocate( y_fd (isd:ied ,jsd:jed+shift,nz) ) + allocate( y_save(isd:ied ,jsd:jed+shift,nz) ) + + x_fd=0; y_fd=0 + do k = 1,nz + do j = js,je + do i = is,ie + x_fd(i,j,k)=i*j + y_fd(i,j,k)=i*j + end do + end do + end do + + call mpp_update_domains( x_fd, y_fd, domain, gridtype=CGRID_NE) + x_save=x_fd + y_save=y_fd + + fd_sum = 0. + do k = 1,nz + do j = jsd,jed + do i = isd,ied+shift + fd_sum = fd_sum + x_fd(i,j,k)*x_fd(i,j,k) + end do + end do + end do + do k = 1,nz + do j = jsd,jed+shift + do i = isd,ied + fd_sum = fd_sum + y_fd(i,j,k)*y_fd(i,j,k) + end do + end do + end do + call mpp_sum( fd_sum ) + + x_ad = x_fd + y_ad = y_fd + call mpp_update_domains_ad( x_ad, y_ad, domain, gridtype=CGRID_NE) + + ad_sum = 0.0 + do k = 1,nz + do j = jsd,jed + do i = isd,ied+shift + ad_sum = ad_sum + x_ad(i,j,k)*x_save(i,j,k) + end do + end do + end do + do k = 1,nz + do j = jsd,jed+shift + do i = isd,ied + ad_sum = ad_sum + y_ad(i,j,k)*y_save(i,j,k) + end do + end do + end do + call mpp_sum( ad_sum ) + call mpp_sync() + pe = mpp_pe() + sum_diff = 0.0 + sum_diff = abs(ad_sum-fd_sum)/fd_sum + + if ( pe.EQ.mpp_root_pe() ) then + !> @note: threshold differs from R8 test threshold because of expected increase + !! in roundoff error with reduced precision + if (sum_diff .lt. 
1E-6) then + call MPP_ERROR(NOTE, "Passed Adjoint Dot Test: mpp_update_domains_ad_r4(vector 3D fields)") + else + call MPP_ERROR(NOTE, "FAILED Adjoint Dot Test: mpp_update_domains_ad_r4(vector 3D fields)") + endif + endif + deallocate (x_ad, y_ad, x_fd, y_fd, x_save, y_save) + + end subroutine test_halo_update_ad_r4 +end program test_mpp_update_domains_ad diff --git a/test_fms/mpp/test_mpp_update_domains_ad.sh b/test_fms/mpp/test_mpp_update_domains_ad.sh new file mode 100755 index 0000000000..1cfe21dc59 --- /dev/null +++ b/test_fms/mpp/test_mpp_update_domains_ad.sh @@ -0,0 +1,47 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Jessica Liptak + +# Set common test settings. +. 
../test_common.sh +# Run the test for one processor +rm -f input.nml +touch input.nml + +#echo "Running test_mpp_update_domains_ad with 1 pe" +#run_test test_mpp_update_domains_ad 1 +# If on a Linux system that uses the command `nproc`, run the test +if [ $(command -v nproc) ] + # Looks like a linux system + then + # Get the number of available CPUs on the system + nProc=$(nproc) + if [ ${nProc} -ge 4 ] + then + # Run the test with 4 pes + echo "Running test_mpp_update_domains_ad with 4 pes" + run_test test_mpp_update_domains_ad 4 + fi +fi diff --git a/test_fms/mpp/test_mpp_update_domains_int.F90 b/test_fms/mpp/test_mpp_update_domains_int.F90 new file mode 100644 index 0000000000..e0c7434562 --- /dev/null +++ b/test_fms/mpp/test_mpp_update_domains_int.F90 @@ -0,0 +1,365 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @author Jessica Liptak +!> @brief Test mpp_update_domains on arrays of integers using different layouts and data precision +!> @note This test is an extension of the routine test_halo_upate in test_mpp_domains. +module test_mpp_update_domains_int + +! 
use compare_data_checksums, only : compare_checksums + use fill_halo + use compare_data_checksums_int, only : compare_checksums => compare_checksums_int + use mpp_mod, only : FATAL, WARNING, MPP_DEBUG, NOTE, MPP_CLOCK_SYNC,MPP_CLOCK_DETAILED + use mpp_mod, only : mpp_pe, mpp_npes, mpp_root_pe, mpp_error, mpp_sync_self + use mpp_mod, only : mpp_clock_id, mpp_clock_begin, mpp_clock_end + use mpp_mod, only : mpp_declare_pelist, mpp_set_current_pelist, mpp_set_stack_size + use mpp_mod, only : mpp_broadcast, mpp_transmit, mpp_sum, mpp_max, mpp_chksum, ALL_PES + use mpp_domains_mod, only : GLOBAL_DATA_DOMAIN, BITWISE_EXACT_SUM, BGRID_NE, CGRID_NE, DGRID_NE, AGRID + use mpp_domains_mod, only : FOLD_SOUTH_EDGE, FOLD_NORTH_EDGE, FOLD_WEST_EDGE, FOLD_EAST_EDGE + use mpp_domains_mod, only : MPP_DOMAIN_TIME, CYCLIC_GLOBAL_DOMAIN, NUPDATE,EUPDATE, XUPDATE, YUPDATE, SCALAR_PAIR + use mpp_domains_mod, only : domain1D, domain2D, DomainCommunicator2D, BITWISE_EFP_SUM + use mpp_domains_mod, only : mpp_get_compute_domain, mpp_get_data_domain + use mpp_domains_mod, only : mpp_global_field, mpp_global_sum, mpp_global_max, mpp_global_min + use mpp_domains_mod, only : mpp_broadcast_domain + use mpp_domains_mod, only : mpp_update_domains, mpp_check_field, mpp_redistribute, mpp_get_memory_domain + use mpp_domains_mod, only : mpp_define_layout, mpp_define_domains, mpp_modify_domain + use mpp_domains_mod, only : mpp_get_neighbor_pe, mpp_define_mosaic, mpp_nullify_domain_list + use mpp_domains_mod, only : NORTH, NORTH_EAST, EAST, SOUTH_EAST, CORNER, CENTER + use mpp_domains_mod, only : SOUTH, SOUTH_WEST, WEST, NORTH_WEST, mpp_define_mosaic_pelist + use mpp_domains_mod, only : mpp_get_global_domain, ZERO, NINETY, MINUS_NINETY + use mpp_domains_mod, only : mpp_deallocate_domain + use mpp_io_mod, only: mpp_io_init + use platform_mod, only: i4_kind, i8_kind + + implicit none + private + integer :: id + integer :: nx=64, ny=64, nz=10, stackmax=10000000 + integer :: layout(2) + integer :: mpes = 0 
+ integer :: whalo = 2, ehalo = 2, shalo = 2, nhalo = 2 + integer :: x_cyclic_offset = 3 ! to be used in test_cyclic_offset + integer :: y_cyclic_offset = -4 ! to be used in test_cyclic_offset + character(len=32) :: warn_level = "fatal" + integer :: wide_halo_x = 0, wide_halo_y = 0 + integer :: nx_cubic = 0, ny_cubic = 0 + integer :: ensemble_size = 1 + integer :: layout_cubic(2) = (/0,0/) + integer :: layout_tripolar(2) = (/0,0/) + integer :: layout_ensemble(2) = (/0,0/) + integer :: n + integer :: stdunit = 6 + + public :: test_halo_update_i8, test_halo_update_i4, test_subset_update_i8, test_subset_update_i4 + + contains + + !> Perform simple addition on arrays of 64-bit integersin different domain configurations and update the domains + subroutine test_halo_update_i8( domain_type ) + character(len=*), intent(in) :: domain_type !< the domain type that will be tested + integer(kind=i8_kind), allocatable, dimension(:,:,:) :: xi8, x1i8, x2i8, x3i8, x4i8 + type(domain2D) :: domain + integer(kind=i8_kind), allocatable :: globali8(:,:,:) + integer :: shift, xhalo, yhalo + logical :: is_symmetry + integer :: is, ie, js, je, isd, ied, jsd, jed + integer(kind=i8_kind) :: i, j, k ! kind specified because i,j,k define the x#i8 and globali8 values + integer :: pe, npes + + pe = mpp_pe() + npes = mpp_npes() + + allocate(globali8(1-whalo:nx+ehalo,1-shalo:ny+nhalo,nz) ) + ! populate the global array + globali8 = 0.0 + do k = 1,nz + do j = 1,ny + do i = 1,nx + globali8(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + if(index(domain_type, 'symmetry') == 0) then + is_symmetry = .false. + else + is_symmetry = .true. 
+ end if + select case(trim(domain_type)) + case( 'Simple', 'Simple symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=trim(domain_type), symmetry = is_symmetry ) + case( 'Cyclic', 'Cyclic symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN, & + name=trim(domain_type), symmetry = is_symmetry ) + globali8(1-whalo:0, 1:ny,:) = globali8(nx-whalo+1:nx, 1:ny,:) + globali8(nx+1:nx+ehalo, 1:ny,:) = globali8(1:ehalo, 1:ny,:) + globali8(1-whalo:nx+ehalo, 1-shalo:0,:) = globali8(1-whalo:nx+ehalo, ny-shalo+1:ny,:) + globali8(1-whalo:nx+ehalo, ny+1:ny+nhalo,:) = globali8(1-whalo:nx+ehalo, 1:nhalo,:) + case default + call mpp_error( FATAL, 'test_halo_update_i8: no such test: '//domain_type ) + end select + + ! define the arrays + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + allocate(xi8(isd:ied,jsd:jed,nz) ) + allocate(x1i8(isd:ied,jsd:jed,nz) ) + allocate(x2i8(isd:ied,jsd:jed,nz) ) + allocate(x3i8(isd:ied,jsd:jed,nz) ) + allocate(x4i8(isd:ied,jsd:jed,nz) ) + xi8(:,:,:) = 0.0 + xi8 (is:ie,js:je,:) = globali8(is:ie,js:je,:) + x1i8 = xi8; x2i8 = xi8; x3i8 = xi8; x4i8 = xi8 + + ! update the halo region + id = mpp_clock_id( domain_type, flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( xi8, domain ) + call mpp_clock_end (id) + call compare_checksums( xi8, globali8(isd:ied,jsd:jed,:), domain_type ) + + ! update part of the halo region + id = mpp_clock_id( domain_type//' partial', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( x1i8, domain, NUPDATE+EUPDATE, complete=.false. 
) + call mpp_update_domains( x2i8, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x3i8, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x4i8, domain, NUPDATE+EUPDATE, complete=.true. ) + call mpp_clock_end (id) + call compare_checksums( x1i8(is:ied,js:jed,:), globali8(is:ied,js:jed,:), domain_type//' partial x1i8' ) + call compare_checksums( x2i8(is:ied,js:jed,:), globali8(is:ied,js:jed,:), domain_type//' partial x2i8' ) + call compare_checksums( x3i8(is:ied,js:jed,:), globali8(is:ied,js:jed,:), domain_type//' partial x3i8' ) + call compare_checksums( x4i8(is:ied,js:jed,:), globali8(is:ied,js:jed,:), domain_type//' partial x4i8' ) + + deallocate(globali8, xi8, x1i8, x2i8, x3i8, x4i8) + + end subroutine test_halo_update_i8 + + !> Perform simple addition on 32-bit real arrays in different domain configurations and update the domains + subroutine test_halo_update_i4( domain_type ) + character(len=*), intent(in) :: domain_type !< the domain type that will be tested + integer(kind=i4_kind), allocatable, dimension(:,:,:) :: xi4, x1i4, x2i4, x3i4, x4i4 + type(domain2D) :: domain + integer(kind=i4_kind), allocatable :: globali4(:,:,:) + integer :: shift, xhalo, yhalo + logical :: is_symmetry + integer :: is, ie, js, je, isd, ied, jsd, jed + integer(kind=i4_kind) :: i, j, k ! kind specified because i,j,k define the x#i4 and globali4 values + integer :: pe, npes + + pe = mpp_pe() + npes = mpp_npes() + + allocate(globali4(1-whalo:nx+ehalo,1-shalo:ny+nhalo,nz) ) + + globali4 = 0 + do k = 1,nz + do j = 1,ny + do i = 1,nx + globali4(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + if(index(domain_type, 'symmetry') == 0) then + is_symmetry = .false. + else + is_symmetry = .true. 
+ end if + select case(trim(domain_type)) + case( 'Simple', 'Simple symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=trim(domain_type), symmetry = is_symmetry ) + case( 'Cyclic', 'Cyclic symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN, & + name=trim(domain_type), symmetry = is_symmetry ) + globali4(1-whalo:0, 1:ny,:) = globali4(nx-whalo+1:nx, 1:ny,:) + globali4(nx+1:nx+ehalo, 1:ny,:) = globali4(1:ehalo, 1:ny,:) + globali4(1-whalo:nx+ehalo, 1-shalo:0,:) = globali4(1-whalo:nx+ehalo, ny-shalo+1:ny,:) + globali4(1-whalo:nx+ehalo, ny+1:ny+nhalo,:) = globali4(1-whalo:nx+ehalo, 1:nhalo,:) + case default + call mpp_error( FATAL, 'test_halo_update_i4: '//domain_type//' is not a valid test.') + end select + ! define the arrays + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + allocate(xi4(isd:ied,jsd:jed,nz) ) + allocate(x1i4(isd:ied,jsd:jed,nz) ) + allocate(x2i4(isd:ied,jsd:jed,nz) ) + allocate(x3i4(isd:ied,jsd:jed,nz) ) + allocate(x4i4(isd:ied,jsd:jed,nz) ) + xi4 = 0.0 + xi4 (is:ie,js:je,:) = globali4(is:ie,js:je,:) + x1i4 = xi4; x2i4 = xi4; x3i4 = xi4; x4i4 = xi4 + ! update the halo region + id = mpp_clock_id( domain_type, flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( xi4, domain ) + call mpp_clock_end (id) + call compare_checksums( xi4, globali4(isd:ied,jsd:jed,:), domain_type ) + ! update part of the halo region + id = mpp_clock_id( domain_type//' partial', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( x1i4, domain, NUPDATE+EUPDATE, complete=.false. 
) + call mpp_update_domains( x2i4, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x3i4, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x4i4, domain, NUPDATE+EUPDATE, complete=.true. ) + call mpp_clock_end (id) + call compare_checksums( x1i4(is:ied,js:jed,:), globali4(is:ied,js:jed,:), domain_type//' partial x1i4' ) + call compare_checksums( x2i4(is:ied,js:jed,:), globali4(is:ied,js:jed,:), domain_type//' partial x2i4' ) + call compare_checksums( x3i4(is:ied,js:jed,:), globali4(is:ied,js:jed,:), domain_type//' partial x3i4' ) + call compare_checksums( x4i4(is:ied,js:jed,:), globali4(is:ied,js:jed,:), domain_type//' partial x4i4' ) + + deallocate(globali4, xi4, x1i4, x2i4, x3i4, x4i4) + + end subroutine test_halo_update_i4 + + !> test a domain update of a 3D array of 64-bit integers on a 9-pe subset of total allotted pes + !> @note requires at least 16 pes + subroutine test_subset_update_i8( ) + integer(kind=i8_kind), allocatable, dimension(:,:,:) :: x + type(domain2D) :: domain + integer(kind=i8_kind), allocatable :: global(:,:,:) + integer :: xhalo, yhalo + integer :: is, ie, js, je, isd, ied, jsd, jed + integer :: pes9(9)=(/1,2,4,6,8,10,12,13,15/) + integer :: ni, nj + integer(kind=i8_kind) :: i, j, k ! 
kind specified because i,j,k define the x and global values + integer :: pe, npes + + pe = mpp_pe() + npes = mpp_npes() + + call mpp_declare_pelist(pes9) + if(any(mpp_pe()==pes9)) then + call mpp_set_current_pelist(pes9) + layout = (/3,3/) + ni = 3; nj =3 + call mpp_define_domains((/1,ni,1,nj/), layout, domain, xhalo=1 & + &, yhalo=1, xflags=CYCLIC_GLOBAL_DOMAIN, yflags& + &=CYCLIC_GLOBAL_DOMAIN, name='subset domain') + call mpp_get_compute_domain(domain, is, ie, js, je) + print*, "pe=", mpp_pe(), is, ie, js, je + + allocate(global(0:ni+1,0:nj+1,nz) ) + + global = 0 + do k = 1,nz + do j = 1,nj + do i = 1,ni + global(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + global(0, 1:nj,:) = global(ni, 1:nj,:) + global(ni+1, 1:nj,:) = global(1, 1:nj,:) + global(0:ni+1, 0, :) = global(0:ni+1, nj, :) + global(0:ni+1, nj+1,:) = global(0:ni+1, 1, :) + + ! set up x array + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + allocate( x (isd:ied,jsd:jed,nz) ) + + x(:,:,:) = 0.0 + x (is:ie,js:je,:) = global(is:ie,js:je,:) + + ! full update + call mpp_update_domains(x, domain) + call compare_checksums(x, global(isd:ied,jsd:jed,:), '64-bit array 9 pe subset') + + deallocate(x, global) + call mpp_deallocate_domain(domain) + endif + + call mpp_set_current_pelist() + + end subroutine test_subset_update_i8 + + !> test a domain update of a 3D array of 32-bit integers on a 9-pe subset of total allotted pes + !> @note requires at least 16 pes + subroutine test_subset_update_i4( ) + integer(kind=i4_kind), allocatable, dimension(:,:,:) :: x + type(domain2D) :: domain + integer(kind=i4_kind), allocatable :: global(:,:,:) + integer :: xhalo, yhalo + integer :: is, ie, js, je, isd, ied, jsd, jed + integer :: pes9(9)=(/1,2,4,6,8,10,12,13,15/) + integer :: ni, nj + integer(kind=i4_kind) :: i, j, k ! 
kind specified because i,j,k define the xand globalvalues + integer :: pe, npes + + pe = mpp_pe() + npes = mpp_npes() + + call mpp_declare_pelist(pes9) + if(any(mpp_pe()==pes9)) then + call mpp_set_current_pelist(pes9) + layout = (/3,3/) + ni = 3; nj =3 + call mpp_define_domains((/1,ni,1,nj/), layout, domain, xhalo=1 & + &, yhalo=1, xflags=CYCLIC_GLOBAL_DOMAIN, yflags& + &=CYCLIC_GLOBAL_DOMAIN, name='subset domain') + call mpp_get_compute_domain(domain, is, ie, js, je) + print*, "pe=", mpp_pe(), is, ie, js, je + + allocate(global(0:ni+1,0:nj+1,nz) ) + + global = 0 + do k = 1,nz + do j = 1,nj + do i = 1,ni + global(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + global(0, 1:nj,:) = global(ni, 1:nj,:) + global(ni+1, 1:nj,:) = global(1, 1:nj,:) + global(0:ni+1, 0, :) = global(0:ni+1, nj, :) + global(0:ni+1, nj+1,:) = global(0:ni+1, 1, :) + + ! set up x array + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + allocate( x (isd:ied,jsd:jed,nz) ) + + x(:,:,:) = 0.0 + x (is:ie,js:je,:) = global(is:ie,js:je,:) + + ! full update + call mpp_update_domains(x, domain) + call compare_checksums(x, global(isd:ied,jsd:jed,:), '32-bit array on 9 pe subset') + + deallocate(x, global) + call mpp_deallocate_domain(domain) + endif + + call mpp_set_current_pelist() + + end subroutine test_subset_update_i4 + +end module test_mpp_update_domains_int \ No newline at end of file diff --git a/test_fms/mpp/test_mpp_update_domains_main.F90 b/test_fms/mpp/test_mpp_update_domains_main.F90 new file mode 100644 index 0000000000..0eb5223df1 --- /dev/null +++ b/test_fms/mpp/test_mpp_update_domains_main.F90 @@ -0,0 +1,99 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). 
+!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @author Jessica Liptak +!> @brief run mpp_domains tests on arrays of integers and real numbers +!! using different layouts and data precision +!> @note This test calls extensions of the routine test_halo_upate in test_mpp_domains. +program test_mpp_update_domains_main + + use test_mpp_update_domains_real, only : test_halo_update_r8, test_halo_update_r4 + use test_mpp_update_domains_real, only : test_subset_update_r8, test_subset_update_r4 + use test_mpp_update_domains_int , only : test_halo_update_i8, test_halo_update_i4 + use test_mpp_update_domains_int, only : test_subset_update_i8, test_subset_update_i4 + use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe + use mpp_mod, only : mpp_set_stack_size + use mpp_mod, only : mpp_init_test_requests_allocated + use mpp_domains_mod, only : MPP_DOMAIN_TIME, mpp_domains_set_stack_size + use mpp_domains_mod, only : mpp_domains_init, mpp_domains_exit + use mpp_io_mod, only: mpp_io_init + use platform_mod + + implicit none + + integer :: ierr + integer :: stackmax=10000000 + !> Initialize mpp and mpp IO modules + call mpp_init(test_level=mpp_init_test_requests_allocated) + call mpp_domains_init(MPP_DOMAIN_TIME) + call mpp_io_init() + call mpp_domains_set_stack_size(stackmax) + !> run the tests + !> run the tests + if (mpp_pe() == 
mpp_root_pe()) & + print *, '--------------------> Calling real test_halo_update <-------------------' + + call test_halo_update_r4( 'Simple' ) !includes global field, global sum tests + call test_halo_update_r4( 'Cyclic' ) + call test_halo_update_r4( 'Folded-north' ) !includes vector field test + call test_halo_update_r4( 'Masked' ) !includes vector field test + call test_halo_update_r4( 'Folded xy_halo' ) ! + call test_halo_update_r4( 'Simple symmetry' ) !includes global field, global sum tests + call test_halo_update_r4( 'Cyclic symmetry' ) + call test_halo_update_r4( 'Folded-north symmetry' ) !includes vector field test + call test_halo_update_r4( 'Folded-south symmetry' ) !includes vector field test + call test_halo_update_r4( 'Folded-west symmetry' ) !includes vector field test + call test_halo_update_r4( 'Folded-east symmetry' ) !includes vector field test + + call test_halo_update_r8( 'Simple' ) !includes global field, global sum tests + call test_halo_update_r8( 'Cyclic' ) + call test_halo_update_r8( 'Folded-north' ) !includes vector field test + call test_halo_update_r8( 'Masked' ) !includes vector field test + call test_halo_update_r8( 'Folded xy_halo' ) ! 
+ call test_halo_update_r8( 'Simple symmetry' ) !includes global field, global sum tests + call test_halo_update_r8( 'Cyclic symmetry' ) + call test_halo_update_r8( 'Folded-north symmetry' ) !includes vector field test + call test_halo_update_r8( 'Folded-south symmetry' ) !includes vector field test + call test_halo_update_r8( 'Folded-west symmetry' ) !includes vector field test + call test_halo_update_r8( 'Folded-east symmetry' ) !includes vector field test + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Calling integer test_halo_update <-------------------' + !> @note mpp_update_domains vector interfaces are not defined for integer data types + call test_halo_update_i4( 'Simple' ) !includes global field, global sum tests + call test_halo_update_i4( 'Cyclic' ) + call test_halo_update_i4( 'Simple symmetry' ) !includes global field, global sum tests + call test_halo_update_i4( 'Cyclic symmetry' ) + call test_halo_update_i8( 'Simple' ) !includes global field, global sum tests + call test_halo_update_i8( 'Cyclic' ) + call test_halo_update_i8( 'Simple symmetry' ) !includes global field, global sum tests + call test_halo_update_i8( 'Cyclic symmetry' ) + ! pe subset test + !> @todo resolve issue. This test triggers an error in mpp_clock_begin called by mpp_update_domains + !! cannot change pelist context of a clock. + if (mpp_npes() .GE. 
16) then + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Calling test_subset_update <-------------------' + call test_subset_update_r8 + call test_subset_update_r4 + call test_subset_update_i8 + call test_subset_update_i4 + endif + call mpp_domains_exit() + !> Finalize mpp + call MPI_FINALIZE(ierr) +end program test_mpp_update_domains_main \ No newline at end of file diff --git a/test_fms/mpp/test_mpp_update_domains_real.F90 b/test_fms/mpp/test_mpp_update_domains_real.F90 new file mode 100644 index 0000000000..f2ccd2c2a8 --- /dev/null +++ b/test_fms/mpp/test_mpp_update_domains_real.F90 @@ -0,0 +1,979 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @author Jessica Liptak +!> @brief Test mpp_update_domains on arrays of real numbers using different layouts and data precision +!> @note This test is an extension of the routine test_halo_upate in test_mpp_domains. 
+module test_mpp_update_domains_real + + use compare_data_checksums, only : compare_checksums + use fill_halo + use mpp_mod, only : FATAL, WARNING, MPP_DEBUG, NOTE, MPP_CLOCK_SYNC,MPP_CLOCK_DETAILED + use mpp_mod, only : mpp_pe, mpp_npes, mpp_root_pe, mpp_error + use mpp_mod, only : mpp_clock_id, mpp_clock_begin, mpp_clock_end, mpp_sync + use mpp_mod, only : mpp_declare_pelist, mpp_set_current_pelist, mpp_set_stack_size + use mpp_mod, only : mpp_broadcast, mpp_transmit, mpp_sum, mpp_max, mpp_chksum, ALL_PES + use mpp_mod, only : mpp_gather, mpp_sync_self + use mpp_domains_mod, only : GLOBAL_DATA_DOMAIN, BITWISE_EXACT_SUM, BGRID_NE, CGRID_NE, DGRID_NE, AGRID + use mpp_domains_mod, only : FOLD_SOUTH_EDGE, FOLD_NORTH_EDGE, FOLD_WEST_EDGE, FOLD_EAST_EDGE + use mpp_domains_mod, only : MPP_DOMAIN_TIME, CYCLIC_GLOBAL_DOMAIN, NUPDATE,EUPDATE, XUPDATE, YUPDATE, SCALAR_PAIR + use mpp_domains_mod, only : domain1D, domain2D, DomainCommunicator2D, BITWISE_EFP_SUM + use mpp_domains_mod, only : mpp_get_compute_domain, mpp_get_data_domain + use mpp_domains_mod, only : mpp_global_field, mpp_global_sum, mpp_global_max, mpp_global_min + use mpp_domains_mod, only : mpp_broadcast_domain + use mpp_domains_mod, only : mpp_update_domains, mpp_check_field, mpp_redistribute, mpp_get_memory_domain + use mpp_domains_mod, only : mpp_define_layout, mpp_define_domains, mpp_modify_domain + use mpp_domains_mod, only : mpp_get_neighbor_pe, mpp_define_mosaic, mpp_nullify_domain_list + use mpp_domains_mod, only : NORTH, NORTH_EAST, EAST, SOUTH_EAST, CORNER, CENTER + use mpp_domains_mod, only : SOUTH, SOUTH_WEST, WEST, NORTH_WEST, mpp_define_mosaic_pelist + use mpp_domains_mod, only : mpp_get_global_domain, ZERO, NINETY, MINUS_NINETY + use mpp_domains_mod, only : mpp_deallocate_domain + use platform_mod + + implicit none + private + integer :: id + integer :: nx=64, ny=64, nz=10, stackmax=10000000 + integer :: i, j, k, n + integer :: layout(2) + integer :: mpes = 0 + integer :: whalo = 2, ehalo = 2, 
shalo = 2, nhalo = 2 + integer :: x_cyclic_offset = 3 ! to be used in test_cyclic_offset + integer :: y_cyclic_offset = -4 ! to be used in test_cyclic_offset + character(len=32) :: warn_level = "fatal" + integer :: wide_halo_x = 0, wide_halo_y = 0 + integer :: nx_cubic = 0, ny_cubic = 0 + integer :: ensemble_size = 1 + integer :: layout_cubic(2) = (/0,0/) + integer :: layout_tripolar(2) = (/0,0/) + integer :: layout_ensemble(2) = (/0,0/) + public :: test_halo_update_r8, test_halo_update_r4, test_subset_update_r8, test_subset_update_r4 + + contains + + !> Perform simple addition on 64-bit real arrays in different domain configurations and update the domains + subroutine test_halo_update_r8( domain_type ) + character(len=*), intent(in) :: domain_type !< the domain type that will be tested + real(kind=r8_kind), allocatable, dimension(:,:,:) :: xr8, x1r8, x2r8, x3r8, x4r8 + real(kind=r8_kind), allocatable, dimension(:,:,:) :: yr8, y1r8, y2r8, y3r8, y4r8 + type(domain2D) :: domain + real(kind=r8_kind), allocatable :: global1r8(:,:,:), global2r8(:,:,:), globalr8(:,:,:) + logical, allocatable :: maskmap(:,:) + integer :: shift, i, xhalo, yhalo + logical :: is_symmetry, folded_south, folded_west, folded_east + integer :: is, ie, js, je, isd, ied, jsd, jed + integer :: pe, npes + + pe = mpp_pe() + npes = mpp_npes() + ! when testing maskmap option, nx*ny should be able to be divided by both npes and npes+1 + if ((domain_type == 'Masked') .OR. (domain_type == 'Masked symmetry')) then + if((mod(nx*ny, npes) .NE. 0) .OR. (mod(nx*ny, npes+1) .NE. 0)) then + call mpp_error(NOTE,'test_halo_update_r8: nx*ny can not be divided by both npes and npes+1, '//& + 'Masked test_halo_update_r8 will not be tested') + return + end if + end if + + if(trim(domain_type) == 'Folded xy_halo' ) then + xhalo = max(whalo, ehalo); yhalo = max(shalo, nhalo) + allocate(globalr8(1-xhalo:nx+xhalo,1-yhalo:ny+yhalo,nz) ) + else + allocate(globalr8(1-whalo:nx+ehalo,1-shalo:ny+nhalo,nz) ) + end if + ! 
populate the global array + globalr8 = 0.0 + do k = 1,nz + do j = 1,ny + do i = 1,nx + globalr8(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + if(index(domain_type, 'symmetry') == 0) then + is_symmetry = .false. + else + is_symmetry = .true. + end if + select case(trim(domain_type)) + case( 'Simple', 'Simple symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=trim(domain_type), symmetry = is_symmetry ) + case( 'Cyclic', 'Cyclic symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN, & + name=trim(domain_type), symmetry = is_symmetry ) + globalr8(1-whalo:0, 1:ny,:) = globalr8(nx-whalo+1:nx, 1:ny,:) + globalr8(nx+1:nx+ehalo, 1:ny,:) = globalr8(1:ehalo, 1:ny,:) + globalr8(1-whalo:nx+ehalo, 1-shalo:0,:) = globalr8(1-whalo:nx+ehalo, ny-shalo+1:ny,:) + globalr8(1-whalo:nx+ehalo, ny+1:ny+nhalo,:) = globalr8(1-whalo:nx+ehalo, 1:nhalo,:) + case( 'Folded-north', 'Folded-north symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, & + name=domain_type, symmetry = is_symmetry ) + call fill_folded_north_halo(globalr8, 0, 0, 0, 0, 1) + case( 'Folded-south symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_SOUTH_EDGE, & + name=domain_type, symmetry = is_symmetry ) + call fill_folded_south_halo(globalr8, 0, 0, 0, 0, 1) + case( 'Folded-west symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + 
call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=FOLD_WEST_EDGE, yflags=CYCLIC_GLOBAL_DOMAIN, & + name=domain_type, symmetry = is_symmetry ) + call fill_folded_west_halo(globalr8, 0, 0, 0, 0, 1) + case( 'Folded-east symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=FOLD_EAST_EDGE, yflags=CYCLIC_GLOBAL_DOMAIN, & + name=domain_type, symmetry = is_symmetry ) + call fill_folded_east_halo(globalr8, 0, 0, 0, 0, 1) + case( 'Folded xy_halo' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, xhalo=xhalo, yhalo=yhalo, & + xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, name=domain_type, symmetry = is_symmetry ) + globalr8(1-xhalo:0, 1:ny,:) = globalr8(nx-xhalo+1:nx, 1:ny,:) + globalr8(nx+1:nx+xhalo, 1:ny,:) = globalr8(1:xhalo, 1:ny,:) + globalr8(1-xhalo:nx+xhalo,ny+1:ny+yhalo,:) = globalr8(nx+xhalo:1-xhalo:-1, ny:ny-yhalo+1:-1,:) + case( 'Masked', 'Masked symmetry' ) + !with fold and cyclic, assign to npes+1 and mask out the top-rightdomain + call mpp_define_layout( (/1,nx,1,ny/), npes+1, layout ) + allocate( maskmap(layout(1),layout(2)) ) + maskmap(:,:) = .TRUE.; maskmap(layout(1),layout(2)) = .FALSE. + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, & + maskmap=maskmap, name=domain_type, symmetry = is_symmetry ) + deallocate(maskmap) + !we need to zero out the globalr8 data on the missing domain. + !this logic assumes top-right, in an even division + if( mod(nx,layout(1)).NE.0 .OR. mod(ny,layout(2)).NE.0 )call mpp_error( FATAL, & + 'TEST_MPP_DOMAINS: test for masked domains needs (nx,ny) to divide evenly on npes+1 PEs.' 
) + globalr8(nx-nx/layout(1)+1:nx,ny-ny/layout(2)+1:ny,:) = 0 + call fill_folded_north_halo(globalr8, 0, 0, 0, 0, 1) + case default + call mpp_error( FATAL, 'test_halo_update_r8: no such test: '//domain_type ) + end select + + ! define the arrays + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + allocate(xr8(isd:ied,jsd:jed,nz) ) + allocate(x1r8(isd:ied,jsd:jed,nz) ) + allocate(x2r8(isd:ied,jsd:jed,nz) ) + allocate(x3r8(isd:ied,jsd:jed,nz) ) + allocate(x4r8(isd:ied,jsd:jed,nz) ) + xr8(:,:,:) = 0.0 + xr8 (is:ie,js:je,:) = globalr8(is:ie,js:je,:) + x1r8 = xr8; x2r8 = xr8; x3r8 = xr8; x4r8 = xr8 + + ! update the halo region + id = mpp_clock_id( domain_type, flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( xr8, domain ) + call mpp_clock_end (id) + call compare_checksums( xr8, globalr8(isd:ied,jsd:jed,:), domain_type ) + + ! update part of the halo region + id = mpp_clock_id( domain_type//' partial', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( x1r8, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x2r8, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x3r8, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x4r8, domain, NUPDATE+EUPDATE, complete=.true. ) + call mpp_clock_end (id) + call compare_checksums( x1r8(is:ied,js:jed,:), globalr8(is:ied,js:jed,:), domain_type//' partial x1r8' ) + call compare_checksums( x2r8(is:ied,js:jed,:), globalr8(is:ied,js:jed,:), domain_type//' partial x2r8' ) + call compare_checksums( x3r8(is:ied,js:jed,:), globalr8(is:ied,js:jed,:), domain_type//' partial x3r8' ) + call compare_checksums( x4r8(is:ied,js:jed,:), globalr8(is:ied,js:jed,:), domain_type//' partial x4r8' ) + + !--- test vector update for FOLDED and MASKED case. + if(domain_type == 'Simple' .or. domain_type == 'Simple symmetry' .or. 
domain_type == 'Cyclic' .or. domain_type == 'Cyclic symmetry') then + deallocate(xr8,x1r8,x2r8,x3r8,x4r8) + return + end if + + !------------------------------------------------------------------ + ! vector update : BGRID_NE + !------------------------------------------------------------------ + shift = 0 + if(is_symmetry) then + shift = 1 + deallocate(globalr8) + allocate(globalr8(1-whalo:nx+ehalo+shift,1-shalo:ny+nhalo+shift,nz) ) + globalr8(:,:,:) = 0.0 + do k = 1,nz + do j = 1,ny+1 + do i = 1,nx+1 + globalr8(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + if(domain_type == 'Masked symmetry') then + globalr8(nx-nx/layout(1)+1:nx+1,ny-ny/layout(2)+1:ny+1,:) = 0.0 + endif + deallocate(xr8, x1r8, x2r8, x3r8, x4r8) + allocate( xr8(isd:ied+1,jsd:jed+1,nz) ) + allocate( x1r8(isd:ied+1,jsd:jed+1,nz) ) + allocate( x2r8(isd:ied+1,jsd:jed+1,nz) ) + allocate( x3r8(isd:ied+1,jsd:jed+1,nz) ) + allocate( x4r8(isd:ied+1,jsd:jed+1,nz) ) + endif + + folded_south = .false. + folded_west = .false. + folded_east = .false. + select case (domain_type) + case ('Folded-north', 'Masked') + ! fill in folded north edge, cyclic east and west edge + call fill_folded_north_halo(globalr8, 1, 1, 0, 0, -1) + case ('Folded xy_halo') + ! fill in folded north edge, cyclic east and west edge + globalr8(1-xhalo:0, 1:ny,:) = globalr8(nx-xhalo+1:nx, 1:ny,:) + globalr8(nx+1:nx+xhalo, 1:ny,:) = globalr8(1:xhalo, 1:ny,:) + globalr8(1-xhalo:nx+xhalo-1,ny+1:ny+yhalo,:) = -globalr8(nx+xhalo-1:1-xhalo:-1,ny-1:ny-yhalo:-1,:) + globalr8(nx+xhalo, ny+1:ny+yhalo,:) = -globalr8(nx-xhalo, ny-1:ny-yhalo:-1,:) + case ('Folded-north symmetry', 'Masked symmetry' ) + call fill_folded_north_halo(globalr8, 1, 1, 1, 1, -1) + case ('Folded-south symmetry' ) + folded_south = .true. + call fill_folded_south_halo(globalr8, 1, 1, 1, 1, -1) + case ('Folded-west symmetry' ) + folded_west = .true. + call fill_folded_west_halo(globalr8, 1, 1, 1, 1, -1) + case ('Folded-east symmetry' ) + folded_east = .true. 
+ call fill_folded_east_halo(globalr8, 1, 1, 1, 1, -1) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//domain_type ) + end select + + xr8(:,:,:) = 0.0 + xr8(is:ie+shift,js:je+shift,:) = globalr8(is:ie+shift,js:je+shift,:) + !set up yr8 array + allocate( yr8 (isd:ied+shift,jsd:jed+shift,nz) ) + allocate( y1r8(isd:ied+shift,jsd:jed+shift,nz) ) + allocate( y2r8(isd:ied+shift,jsd:jed+shift,nz) ) + allocate( y3r8(isd:ied+shift,jsd:jed+shift,nz) ) + allocate( y4r8(isd:ied+shift,jsd:jed+shift,nz) ) + yr8 = xr8; x1r8 = xr8; x2r8 = xr8; x3r8 = xr8; x4r8 = xr8 + yr8 = xr8; y1r8 = xr8; y2r8 = xr8; y3r8 = xr8; y4r8 = xr8 + + id = mpp_clock_id( domain_type//' vector BGRID_NE', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( xr8, yr8, domain, gridtype=BGRID_NE) + call mpp_update_domains( x1r8, y1r8, domain, gridtype=BGRID_NE, complete=.false. ) + call mpp_update_domains( x2r8, y2r8, domain, gridtype=BGRID_NE, complete=.false. ) + call mpp_update_domains( x3r8, y3r8, domain, gridtype=BGRID_NE, complete=.false. ) + call mpp_update_domains( x4r8, y4r8, domain, gridtype=BGRID_NE, complete=.true. ) + call mpp_clock_end (id) + + ! 
redundant points must be equal and opposite + + if(folded_south) then + globalr8(nx/2+shift, 1,:) = 0.0 !pole points must have 0 velocity + globalr8(nx+shift , 1,:) = 0.0 !pole points must have 0 velocity + globalr8(nx/2+1+shift:nx-1+shift, 1,:) = -globalr8(nx/2-1+shift:1+shift:-1, 1,:) + globalr8(1-whalo:shift, 1,:) = -globalr8(nx-whalo+1:nx+shift, 1,:) + globalr8(nx+1+shift:nx+ehalo+shift, 1,:) = -globalr8(1+shift:ehalo+shift, 1,:) + !--- the following will fix the +0/-0 problem on altix + if(shalo >0) globalr8(shift,1,:) = 0.0 !pole points must have 0 velocity + else if(folded_west) then + globalr8(1, ny/2+shift, :) = 0.0 !pole points must have 0 velocity + globalr8(1, ny+shift, :) = 0.0 !pole points must have 0 velocity + globalr8(1, ny/2+1+shift:ny-1+shift, :) = -globalr8(1, ny/2-1+shift:1+shift:-1, :) + globalr8(1, 1-shalo:shift, :) = -globalr8(1, ny-shalo+1:ny+shift, :) + globalr8(1, ny+1+shift:ny+nhalo+shift, :) = -globalr8(1, 1+shift:nhalo+shift, :) + !--- the following will fix the +0/-0 problem on altix + if(whalo>0) globalr8(1, shift, :) = 0.0 !pole points must have 0 velocity + else if(folded_east) then + globalr8(nx+shift, ny/2+shift, :) = 0.0 !pole points must have 0 velocity + globalr8(nx+shift, ny+shift, :) = 0.0 !pole points must have 0 velocity + globalr8(nx+shift, ny/2+1+shift:ny-1+shift, :) = -globalr8(nx+shift, ny/2-1+shift:1+shift:-1, :) + globalr8(nx+shift, 1-shalo:shift, :) = -globalr8(nx+shift, ny-shalo+1:ny+shift, :) + globalr8(nx+shift, ny+1+shift:ny+nhalo+shift, :) = -globalr8(nx+shift, 1+shift:nhalo+shift, :) + if(ehalo >0) globalr8(nx+shift, shift, :) = 0.0 !pole points must have 0 velocity + else + globalr8(nx/2+shift, ny+shift,:) = 0.0 !pole points must have 0 velocity + globalr8(nx+shift , ny+shift,:) = 0.0 !pole points must have 0 velocity + globalr8(nx/2+1+shift:nx-1+shift, ny+shift,:) = -globalr8(nx/2-1+shift:1+shift:-1, ny+shift,:) + if (domain_type == 'Folded xy_halo') then + globalr8(1-xhalo:shift, ny+shift,:) = 
-globalr8(nx-xhalo+1:nx+shift, ny+shift,:) + globalr8(nx+1+shift:nx+xhalo+shift, ny+shift,:) = -globalr8(1+shift:xhalo+shift, ny+shift,:) + else + globalr8(1-whalo:shift, ny+shift,:) = -globalr8(nx-whalo+1:nx+shift, ny+shift,:) + globalr8(nx+1+shift:nx+ehalo+shift, ny+shift,:) = -globalr8(1+shift:ehalo+shift, ny+shift,:) + end if + !--- the following will fix the +0/-0 problem on altix + if(nhalo >0) globalr8(shift,ny+shift,:) = 0.0 !pole points must have 0 velocity + endif + + call compare_checksums( xr8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE xr8' ) + call compare_checksums( yr8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE yr8' ) + call compare_checksums( x1r8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE x1r8' ) + call compare_checksums( x2r8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE x2r8' ) + call compare_checksums( x3r8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE x3r8' ) + call compare_checksums( x4r8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE x4r8' ) + call compare_checksums( y1r8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE y1r8' ) + call compare_checksums( y2r8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE y2r8' ) + call compare_checksums( y3r8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE y3r8' ) + call compare_checksums( y4r8, globalr8(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE y4r8' ) + + deallocate(globalr8, xr8, x1r8, x2r8, x3r8, x4r8, yr8, y1r8, y2r8, y3r8, y4r8) + + !------------------------------------------------------------------ + ! 
vector update : CGRID_NE + !------------------------------------------------------------------ + !--- global1r8 is x-component and global2r8 is y-component + if(domain_type == 'Folded xy_halo') then + allocate(global1r8(1-xhalo:nx+xhalo, 1-yhalo:ny+yhalo, nz)) + allocate(global2r8(1-xhalo:nx+xhalo, 1-yhalo:ny+yhalo, nz)) + else + allocate(global1r8(1-whalo:nx+ehalo+shift, 1-shalo:ny+nhalo, nz)) + allocate(global2r8(1-whalo:nx+ehalo, 1-shalo:ny+nhalo+shift, nz)) + end if + allocate(xr8 (isd:ied+shift,jsd:jed,nz), yr8 (isd:ied,jsd:jed+shift,nz) ) + allocate(x1r8(isd:ied+shift,jsd:jed,nz), y1r8(isd:ied,jsd:jed+shift,nz) ) + allocate(x2r8(isd:ied+shift,jsd:jed,nz), y2r8(isd:ied,jsd:jed+shift,nz) ) + allocate(x3r8(isd:ied+shift,jsd:jed,nz), y3r8(isd:ied,jsd:jed+shift,nz) ) + allocate(x4r8(isd:ied+shift,jsd:jed,nz), y4r8(isd:ied,jsd:jed+shift,nz) ) + + global1r8(:,:,:) = 0.0 + global2r8(:,:,:) = 0.0 + do k = 1,nz + do j = 1,ny + do i = 1,nx+shift + global1r8(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + do j = 1,ny+shift + do i = 1,nx + global2r8(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + if(domain_type == 'Masked' .or. 
domain_type == 'Masked symmetry') then + global1r8(nx-nx/layout(1)+1:nx+shift,ny-ny/layout(2)+1:ny,:) = 0 + global2r8(nx-nx/layout(1)+1:nx,ny-ny/layout(2)+1:ny+shift,:) = 0 + end if + + select case (domain_type) + case ('Folded-north', 'Masked') + !fill in folded north edge, cyclic east and west edge + call fill_folded_north_halo(global1r8, 1, 0, 0, 0, -1) + call fill_folded_north_halo(global2r8, 0, 1, 0, 0, -1) + case ('Folded xy_halo') + global1r8(1-xhalo:0, 1:ny,:) = global1r8(nx-xhalo+1:nx, 1:ny,:) + global1r8(nx+1:nx+xhalo, 1:ny,:) = global1r8(1:xhalo, 1:ny,:) + global2r8(1-xhalo:0, 1:ny,:) = global2r8(nx-xhalo+1:nx, 1:ny,:) + global2r8(nx+1:nx+xhalo, 1:ny,:) = global2r8(1:xhalo, 1:ny,:) + global1r8(1-xhalo:nx+xhalo-1, ny+1:ny+yhalo,:) = -global1r8(nx+xhalo-1:1-xhalo:-1, ny:ny-yhalo+1:-1,:) + global1r8(nx+xhalo, ny+1:ny+yhalo,:) = -global1r8(nx-xhalo, ny:ny-yhalo+1:-1,:) + global2r8(1-xhalo:nx+xhalo, ny+1:ny+yhalo,:) = -global2r8(nx+xhalo:1-xhalo:-1, ny-1:ny-yhalo:-1,:) + case ('Folded-north symmetry') + call fill_folded_north_halo(global1r8, 1, 0, 1, 0, -1) + call fill_folded_north_halo(global2r8, 0, 1, 0, 1, -1) + case ('Folded-south symmetry') + call fill_folded_south_halo(global1r8, 1, 0, 1, 0, -1) + call fill_folded_south_halo(global2r8, 0, 1, 0, 1, -1) + case ('Folded-west symmetry') + call fill_folded_west_halo(global1r8, 1, 0, 1, 0, -1) + call fill_folded_west_halo(global2r8, 0, 1, 0, 1, -1) + case ('Folded-east symmetry') + call fill_folded_east_halo(global1r8, 1, 0, 1, 0, -1) + call fill_folded_east_halo(global2r8, 0, 1, 0, 1, -1) + case default + call mpp_error( FATAL, 'test_halo_update_r8: invalid test name: '//domain_type ) + end select + + xr8(:,:,:) = 0.; yr8(:,:,:) = 0. 
+ xr8(is:ie+shift,js:je, :) = global1r8(is:ie+shift,js:je, :) + yr8(is:ie ,js:je+shift,:) = global2r8(is:ie, js:je+shift,:) + x1r8 = xr8; x2r8 = xr8; x3r8 = xr8; x4r8 = xr8 + y1r8 = yr8; y2r8 = yr8; y3r8 = yr8; y4r8 = yr8 + + id = mpp_clock_id( domain_type//' vector CGRID_NE', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( xr8, yr8, domain, gridtype=CGRID_NE) + call mpp_update_domains( x1r8, y1r8, domain, gridtype=CGRID_NE, complete=.false. ) + call mpp_update_domains( x2r8, y2r8, domain, gridtype=CGRID_NE, complete=.false. ) + call mpp_update_domains( x3r8, y3r8, domain, gridtype=CGRID_NE, complete=.false. ) + call mpp_update_domains( x4r8, y4r8, domain, gridtype=CGRID_NE, complete=.true. ) + call mpp_clock_end (id) + + ! redundant points must be equal and opposite + if(folded_south) then + global2r8(nx/2+1:nx, 1,:) = -global2r8(nx/2:1:-1, 1,:) + global2r8(1-whalo:0, 1,:) = -global2r8(nx-whalo+1:nx, 1, :) + global2r8(nx+1:nx+ehalo, 1,:) = -global2r8(1:ehalo, 1, :) + else if(folded_west) then + global1r8(1, ny/2+1:ny, :) = -global1r8(1, ny/2:1:-1, :) + global1r8(1, 1-shalo:0, :) = -global1r8(1, ny-shalo+1:ny, :) + global1r8(1, ny+1:ny+nhalo, :) = -global1r8(1, 1:nhalo, :) + else if(folded_east) then + global1r8(nx+shift, ny/2+1:ny, :) = -global1r8(nx+shift, ny/2:1:-1, :) + global1r8(nx+shift, 1-shalo:0, :) = -global1r8(nx+shift, ny-shalo+1:ny, :) + global1r8(nx+shift, ny+1:ny+nhalo, :) = -global1r8(nx+shift, 1:nhalo, :) + else + global2r8(nx/2+1:nx, ny+shift,:) = -global2r8(nx/2:1:-1, ny+shift,:) + if(domain_type == 'Folded xy_halo') then + global2r8(1-xhalo:0, ny+shift,:) = -global2r8(nx-xhalo+1:nx, ny+shift,:) + global2r8(nx+1:nx+xhalo, ny+shift,:) = -global2r8(1:xhalo, ny+shift,:) + else + global2r8(1-whalo:0, ny+shift,:) = -global2r8(nx-whalo+1:nx, ny+shift,:) + global2r8(nx+1:nx+ehalo, ny+shift,:) = -global2r8(1:ehalo, ny+shift,:) + end if + endif + + call compare_checksums( xr8, global1r8(isd:ied+shift,jsd:jed, 
:), domain_type//' CGRID_NE xr8' ) + call compare_checksums( yr8, global2r8(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE yr8' ) + call compare_checksums( x1r8, global1r8(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE x1r8' ) + call compare_checksums( x2r8, global1r8(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE x2r8' ) + call compare_checksums( x3r8, global1r8(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE x3r8' ) + call compare_checksums( x4r8, global1r8(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE x4r8' ) + call compare_checksums( y1r8, global2r8(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE y1r8' ) + call compare_checksums( y2r8, global2r8(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE y2r8' ) + call compare_checksums( y3r8, global2r8(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE y3r8' ) + call compare_checksums( y4r8, global2r8(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE y4r8' ) + + deallocate(global1r8, global2r8, xr8, x1r8, x2r8, x3r8, x4r8, yr8, y1r8, y2r8, y3r8, y4r8) + + end subroutine test_halo_update_r8 + + !> Perform simple addition on 32-bit real arrays in different domain configurations and update the domains + subroutine test_halo_update_r4( domain_type ) + character(len=*), intent(in) :: domain_type !< the domain type that will be tested + real(kind=r4_kind), allocatable, dimension(:,:,:) :: xr4, x1r4, x2r4, x3r4, x4r4 + real(kind=r4_kind), allocatable, dimension(:,:,:) :: yr4, y1r4, y2r4, y3r4, y4r4 + type(domain2D) :: domain + real(kind=r4_kind), allocatable :: global1r4(:,:,:), global2r4(:,:,:), globalr4(:,:,:) + logical, allocatable :: maskmap(:,:) + integer :: shift, i, xhalo, yhalo + logical :: is_symmetry, folded_south, folded_west, folded_east + integer :: is, ie, js, je, isd, ied, jsd, jed + integer :: pe, npes + + pe = mpp_pe() + npes = mpp_npes() + + ! when testing maskmap option, nx*ny should be able to be divided by both npes and npes+1 + if((domain_type == 'Masked') .OR. 
(domain_type == 'Masked symmetry')) then + if((mod(nx*ny, npes) .NE. 0) .OR. (mod(nx*ny, npes+1) .NE. 0)) then + call mpp_error(NOTE,'test_halo_update_r4: nx*ny can not be divided by both npes and npes+1, '//& + 'Masked test_halo_update_r4 will not be tested') + return + end if + end if + + if(trim(domain_type) == 'Folded xy_halo' ) then + xhalo = max(whalo, ehalo); yhalo = max(shalo, nhalo) + allocate(globalr4(1-xhalo:nx+xhalo,1-yhalo:ny+yhalo,nz) ) + else + allocate(globalr4(1-whalo:nx+ehalo,1-shalo:ny+nhalo,nz) ) + end if + + globalr4 = 0 + do k = 1,nz + do j = 1,ny + do i = 1,nx + globalr4(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + if(index(domain_type, 'symmetry') == 0) then + is_symmetry = .false. + else + is_symmetry = .true. + end if + select case(trim(domain_type)) + case( 'Simple', 'Simple symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name=trim(domain_type), symmetry = is_symmetry ) + case( 'Cyclic', 'Cyclic symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=CYCLIC_GLOBAL_DOMAIN, & + name=trim(domain_type), symmetry = is_symmetry ) + globalr4(1-whalo:0, 1:ny,:) = globalr4(nx-whalo+1:nx, 1:ny,:) + globalr4(nx+1:nx+ehalo, 1:ny,:) = globalr4(1:ehalo, 1:ny,:) + globalr4(1-whalo:nx+ehalo, 1-shalo:0,:) = globalr4(1-whalo:nx+ehalo, ny-shalo+1:ny,:) + globalr4(1-whalo:nx+ehalo, ny+1:ny+nhalo,:) = globalr4(1-whalo:nx+ehalo, 1:nhalo,:) + case( 'Folded-north', 'Folded-north symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, & + name=domain_type, symmetry = is_symmetry ) + 
call fill_folded_north_halo(globalr4, 0, 0, 0, 0, 1) + case( 'Folded-south symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_SOUTH_EDGE, & + name=domain_type, symmetry = is_symmetry ) + call fill_folded_south_halo(globalr4, 0, 0, 0, 0, 1) + case( 'Folded-west symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=FOLD_WEST_EDGE, yflags=CYCLIC_GLOBAL_DOMAIN, & + name=domain_type, symmetry = is_symmetry ) + call fill_folded_west_halo(globalr4, 0, 0, 0, 0, 1) + case( 'Folded-east symmetry' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=FOLD_EAST_EDGE, yflags=CYCLIC_GLOBAL_DOMAIN, & + name=domain_type, symmetry = is_symmetry ) + call fill_folded_east_halo(globalr4, 0, 0, 0, 0, 1) + case( 'Folded xy_halo' ) + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, xhalo=xhalo, yhalo=yhalo, & + xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, name=domain_type, symmetry = is_symmetry) + globalr4(1-xhalo:0, 1:ny,:) = globalr4(nx-xhalo+1:nx, 1:ny,:) + globalr4(nx+1:nx+xhalo, 1:ny,:) = globalr4(1:xhalo, 1:ny,:) + globalr4(1-xhalo:nx+xhalo,ny+1:ny+yhalo,:) = globalr4(nx+xhalo:1-xhalo:-1, ny:ny-yhalo+1:-1,:) + case( 'Masked', 'Masked symmetry' ) + ! with fold and cyclic, assign to npes+1 and mask out the top-rightdomain + call mpp_define_layout( (/1,nx,1,ny/), npes+1, layout ) + allocate( maskmap(layout(1),layout(2)) ) + maskmap(:,:) = .TRUE.; maskmap(layout(1),layout(2)) = .FALSE. 
+ call mpp_define_domains( (/1,nx,1,ny/), layout, domain, whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, xflags=CYCLIC_GLOBAL_DOMAIN, yflags=FOLD_NORTH_EDGE, & + maskmap=maskmap, name=domain_type, symmetry = is_symmetry ) + deallocate(maskmap) + ! we need to zero out the globalr4 data on the missing domain. + ! this logic assumes top-right, in an even division + if ( mod(nx,layout(1)).NE.0 .OR. mod(ny,layout(2)).NE.0) call mpp_error( FATAL, & + 'test_halo_update_r4: test for masked domains needs (nx,ny) to divide evenly on npes+1 PEs.') + globalr4(nx-nx/layout(1)+1:nx,ny-ny/layout(2)+1:ny,:) = 0 + call fill_folded_north_halo(globalr4, 0, 0, 0, 0, 1) + case default + call mpp_error( FATAL, 'test_halo_update_r4: '//domain_type//' is not a valid test.') + end select + ! define the arrays + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain( domain, isd, ied, jsd, jed ) + allocate(xr4(isd:ied,jsd:jed,nz) ) + allocate(x1r4(isd:ied,jsd:jed,nz) ) + allocate(x2r4(isd:ied,jsd:jed,nz) ) + allocate(x3r4(isd:ied,jsd:jed,nz) ) + allocate(x4r4(isd:ied,jsd:jed,nz) ) + xr4 = 0. + xr4 (is:ie,js:je,:) = globalr4(is:ie,js:je,:) + x1r4 = xr4; x2r4 = xr4; x3r4 = xr4; x4r4 = xr4 + ! update the halo region + id = mpp_clock_id( domain_type, flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( xr4, domain ) + call mpp_clock_end (id) + call compare_checksums( xr4, globalr4(isd:ied,jsd:jed,:), domain_type ) + ! update part of the halo region + id = mpp_clock_id( domain_type//' partial', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( x1r4, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x2r4, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x3r4, domain, NUPDATE+EUPDATE, complete=.false. ) + call mpp_update_domains( x4r4, domain, NUPDATE+EUPDATE, complete=.true. 
) + call mpp_clock_end (id) + call compare_checksums( x1r4(is:ied,js:jed,:), globalr4(is:ied,js:jed,:), domain_type//' partial x1r4' ) + call compare_checksums( x2r4(is:ied,js:jed,:), globalr4(is:ied,js:jed,:), domain_type//' partial x2r4' ) + call compare_checksums( x3r4(is:ied,js:jed,:), globalr4(is:ied,js:jed,:), domain_type//' partial x3r4' ) + call compare_checksums( x4r4(is:ied,js:jed,:), globalr4(is:ied,js:jed,:), domain_type//' partial x4r4' ) + + !--- test vector update for FOLDED and MASKED case. + if(domain_type == 'Simple' .or. domain_type == 'Simple symmetry' .or. domain_type == 'Cyclic' .or. domain_type == 'Cyclic symmetry') then + deallocate(xr4,x1r4,x2r4,x3r4,x4r4) + return + end if + + !------------------------------------------------------------------ + ! vector update : BGRID_NE + !------------------------------------------------------------------ + shift = 0 + if (is_symmetry) then + shift = 1 + deallocate(globalr4) + allocate(globalr4(1-whalo:nx+ehalo+shift,1-shalo:ny+nhalo+shift,nz) ) + globalr4 = 0.0 + do k = 1,nz + do j = 1,ny+1 + do i = 1,nx+1 + globalr4(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + if (domain_type == 'Masked symmetry') then + globalr4(nx-nx/layout(1)+1:nx+1,ny-ny/layout(2)+1:ny+1,:) = 0 + endif + deallocate(xr4, x1r4, x2r4, x3r4, x4r4) + allocate( xr4(isd:ied+1,jsd:jed+1,nz) ) + allocate( x1r4(isd:ied+1,jsd:jed+1,nz) ) + allocate( x2r4(isd:ied+1,jsd:jed+1,nz) ) + allocate( x3r4(isd:ied+1,jsd:jed+1,nz) ) + allocate( x4r4(isd:ied+1,jsd:jed+1,nz) ) + endif + + folded_south = .false. + folded_west = .false. + folded_east = .false. + select case (domain_type) + case ('Folded-north', 'Masked') + ! fill in folded north edge, cyclic east and west edge + call fill_folded_north_halo(globalr4, 1, 1, 0, 0, -1) + case ('Folded xy_halo') + ! 
fill in folded north edge, cyclic east and west edge + globalr4(1-xhalo:0, 1:ny,:) = globalr4(nx-xhalo+1:nx, 1:ny,:) + globalr4(nx+1:nx+xhalo, 1:ny,:) = globalr4(1:xhalo, 1:ny,:) + globalr4(1-xhalo:nx+xhalo-1,ny+1:ny+yhalo,:) = -globalr4(nx+xhalo-1:1-xhalo:-1,ny-1:ny-yhalo:-1,:) + globalr4(nx+xhalo, ny+1:ny+yhalo,:) = -globalr4(nx-xhalo, ny-1:ny-yhalo:-1,:) + case ('Folded-north symmetry', 'Masked symmetry' ) + call fill_folded_north_halo(globalr4, 1, 1, 1, 1, -1) + case ('Folded-south symmetry' ) + folded_south = .true. + call fill_folded_south_halo(globalr4, 1, 1, 1, 1, -1) + case ('Folded-west symmetry' ) + folded_west = .true. + call fill_folded_west_halo(globalr4, 1, 1, 1, 1, -1) + case ('Folded-east symmetry' ) + folded_east = .true. + call fill_folded_east_halo(globalr4, 1, 1, 1, 1, -1) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//domain_type ) + end select + + xr4 = 0.0 + xr4(is:ie+shift,js:je+shift,:) = globalr4(is:ie+shift,js:je+shift,:) + !set up yr4 array + allocate( yr4 (isd:ied+shift,jsd:jed+shift,nz) ) + allocate( y1r4(isd:ied+shift,jsd:jed+shift,nz) ) + allocate( y2r4(isd:ied+shift,jsd:jed+shift,nz) ) + allocate( y3r4(isd:ied+shift,jsd:jed+shift,nz) ) + allocate( y4r4(isd:ied+shift,jsd:jed+shift,nz) ) + yr4 = xr4; x1r4 = xr4; x2r4 = xr4; x3r4 = xr4; x4r4 = xr4 + yr4 = xr4; y1r4 = xr4; y2r4 = xr4; y3r4 = xr4; y4r4 = xr4 + + id = mpp_clock_id( domain_type//' vector BGRID_NE', flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( xr4, yr4, domain, gridtype=BGRID_NE) + call mpp_update_domains( x1r4, y1r4, domain, gridtype=BGRID_NE, complete=.false. ) + call mpp_update_domains( x2r4, y2r4, domain, gridtype=BGRID_NE, complete=.false. ) + call mpp_update_domains( x3r4, y3r4, domain, gridtype=BGRID_NE, complete=.false. ) + call mpp_update_domains( x4r4, y4r4, domain, gridtype=BGRID_NE, complete=.true. ) + call mpp_clock_end (id) + + ! 
redundant points must be equal and opposite + if(folded_south) then + globalr4(nx/2+shift, 1,:) = 0. !pole points must have 0 velocity + globalr4(nx+shift , 1,:) = 0. !pole points must have 0 velocity + globalr4(nx/2+1+shift:nx-1+shift, 1,:) = -globalr4(nx/2-1+shift:1+shift:-1, 1,:) + globalr4(1-whalo:shift, 1,:) = -globalr4(nx-whalo+1:nx+shift, 1,:) + globalr4(nx+1+shift:nx+ehalo+shift, 1,:) = -globalr4(1+shift:ehalo+shift, 1,:) + + if(shalo >0) globalr4(shift,1,:) = 0.0 !pole points must have 0 velocity + else if(folded_west) then + globalr4(1, ny/2+shift, :) = 0.0 !pole points must have 0 velocity + globalr4(1, ny+shift, :) = 0.0 !pole points must have 0 velocity + globalr4(1, ny/2+1+shift:ny-1+shift, :) = -globalr4(1, ny/2-1+shift:1+shift:-1, :) + globalr4(1, 1-shalo:shift, :) = -globalr4(1, ny-shalo+1:ny+shift, :) + globalr4(1, ny+1+shift:ny+nhalo+shift, :) = -globalr4(1, 1+shift:nhalo+shift, :) + !--- the following will fix the +0/-0 problem on altix + if(whalo>0) globalr4(1, shift, :) = 0.0 !pole points must have 0 velocity + else if(folded_east) then + globalr4(nx+shift, ny/2+shift, :) = 0.0 !pole points must have 0 velocity + globalr4(nx+shift, ny+shift, :) = 0.0 !pole points must have 0 velocity + globalr4(nx+shift, ny/2+1+shift:ny-1+shift, :) = -globalr4(nx+shift, ny/2-1+shift:1+shift:-1, :) + globalr4(nx+shift, 1-shalo:shift, :) = -globalr4(nx+shift, ny-shalo+1:ny+shift, :) + globalr4(nx+shift, ny+1+shift:ny+nhalo+shift, :) = -globalr4(nx+shift, 1+shift:nhalo+shift, :) + if(ehalo >0) globalr4(nx+shift, shift, :) = 0.0 !pole points must have 0 velocity + else + globalr4(nx/2+shift, ny+shift,:) = 0.0 !pole points must have 0 velocity + globalr4(nx+shift , ny+shift,:) = 0.0 !pole points must have 0 velocity + globalr4(nx/2+1+shift:nx-1+shift, ny+shift,:) = -globalr4(nx/2-1+shift:1+shift:-1, ny+shift,:) + if(domain_type == 'Folded xy_halo') then + globalr4(1-xhalo:shift, ny+shift,:) = -globalr4(nx-xhalo+1:nx+shift, ny+shift,:) + 
globalr4(nx+1+shift:nx+xhalo+shift, ny+shift,:) = -globalr4(1+shift:xhalo+shift, ny+shift,:) + else + globalr4(1-whalo:shift, ny+shift,:) = -globalr4(nx-whalo+1:nx+shift, ny+shift,:) + globalr4(nx+1+shift:nx+ehalo+shift, ny+shift,:) = -globalr4(1+shift:ehalo+shift, ny+shift,:) + end if + !--- the following will fix the +0/-0 problem on altix + if(nhalo >0) globalr4(shift,ny+shift,:) = 0.0 !pole points must have 0 velocity + endif + + call compare_checksums( xr4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE xr4' ) + call compare_checksums( yr4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE yr4' ) + call compare_checksums( x1r4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE x1r4' ) + call compare_checksums( x2r4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE x2r4' ) + call compare_checksums( x3r4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE x3r4' ) + call compare_checksums( x4r4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE x4r4' ) + call compare_checksums( y1r4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE y1r4' ) + call compare_checksums( y2r4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE y2r4' ) + call compare_checksums( y3r4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE y3r4' ) + call compare_checksums( y4r4, globalr4(isd:ied+shift,jsd:jed+shift,:), domain_type//' BGRID_NE y4r4' ) + + deallocate(globalr4, xr4, x1r4, x2r4, x3r4, x4r4, yr4, y1r4, y2r4, y3r4, y4r4) + !------------------------------------------------------------------ + ! 
vector update : CGRID_NE + !------------------------------------------------------------------ + !--- global1r4 is x-component and global2r4 is y-component + if(domain_type == 'Folded xy_halo') then + allocate(global1r4(1-xhalo:nx+xhalo, 1-yhalo:ny+yhalo, nz)) + allocate(global2r4(1-xhalo:nx+xhalo, 1-yhalo:ny+yhalo, nz)) + else + allocate(global1r4(1-whalo:nx+ehalo+shift, 1-shalo:ny+nhalo, nz)) + allocate(global2r4(1-whalo:nx+ehalo, 1-shalo:ny+nhalo+shift, nz)) + end if + allocate(xr4 (isd:ied+shift,jsd:jed,nz), yr4 (isd:ied,jsd:jed+shift,nz) ) + allocate(x1r4(isd:ied+shift,jsd:jed,nz), y1r4(isd:ied,jsd:jed+shift,nz) ) + allocate(x2r4(isd:ied+shift,jsd:jed,nz), y2r4(isd:ied,jsd:jed+shift,nz) ) + allocate(x3r4(isd:ied+shift,jsd:jed,nz), y3r4(isd:ied,jsd:jed+shift,nz) ) + allocate(x4r4(isd:ied+shift,jsd:jed,nz), y4r4(isd:ied,jsd:jed+shift,nz) ) + + global1r4 = 0.0 + global2r4 = 0.0 + do k = 1,nz + do j = 1,ny + do i = 1,nx+shift + global1r4(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + do j = 1,ny+shift + do i = 1,nx + global2r4(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + if (domain_type == 'Masked' .or. 
domain_type == 'Masked symmetry') then + global1r4(nx-nx/layout(1)+1:nx+shift,ny-ny/layout(2)+1:ny,:) = 0 + global2r4(nx-nx/layout(1)+1:nx,ny-ny/layout(2)+1:ny+shift,:) = 0 + end if + + select case (domain_type) + case ('Folded-north', 'Masked') + !fill in folded north edge, cyclic east and west edge + call fill_folded_north_halo(global1r4, 1, 0, 0, 0, -1) + call fill_folded_north_halo(global2r4, 0, 1, 0, 0, -1) + case ('Folded xy_halo') + global1r4(1-xhalo:0, 1:ny,:) = global1r4(nx-xhalo+1:nx, 1:ny,:) + global1r4(nx+1:nx+xhalo, 1:ny,:) = global1r4(1:xhalo, 1:ny,:) + global2r4(1-xhalo:0, 1:ny,:) = global2r4(nx-xhalo+1:nx, 1:ny,:) + global2r4(nx+1:nx+xhalo, 1:ny,:) = global2r4(1:xhalo, 1:ny,:) + global1r4(1-xhalo:nx+xhalo-1, ny+1:ny+yhalo,:) = -global1r4(nx+xhalo-1:1-xhalo:-1, ny:ny-yhalo+1:-1,:) + global1r4(nx+xhalo, ny+1:ny+yhalo,:) = -global1r4(nx-xhalo, ny:ny-yhalo+1:-1,:) + global2r4(1-xhalo:nx+xhalo, ny+1:ny+yhalo,:) = -global2r4(nx+xhalo:1-xhalo:-1, ny-1:ny-yhalo:-1,:) + case ('Folded-north symmetry') + call fill_folded_north_halo(global1r4, 1, 0, 1, 0, -1) + call fill_folded_north_halo(global2r4, 0, 1, 0, 1, -1) + case ('Folded-south symmetry') + call fill_folded_south_halo(global1r4, 1, 0, 1, 0, -1) + call fill_folded_south_halo(global2r4, 0, 1, 0, 1, -1) + case ('Folded-west symmetry') + call fill_folded_west_halo(global1r4, 1, 0, 1, 0, -1) + call fill_folded_west_halo(global2r4, 0, 1, 0, 1, -1) + case ('Folded-east symmetry') + call fill_folded_east_halo(global1r4, 1, 0, 1, 0, -1) + call fill_folded_east_halo(global2r4, 0, 1, 0, 1, -1) + case default + call mpp_error( FATAL, 'TEST_MPP_DOMAINS: no such test: '//domain_type ) + end select + + xr4 = 0.0; yr4 = 0.0 + xr4(is:ie+shift,js:je, :) = global1r4(is:ie+shift,js:je, :) + yr4(is:ie ,js:je+shift,:) = global2r4(is:ie, js:je+shift,:) + x1r4 = xr4; x2r4 = xr4; x3r4 = xr4; x4r4 = xr4 + y1r4 = yr4; y2r4 = yr4; y3r4 = yr4; y4r4 = yr4 + + id = mpp_clock_id( domain_type//' vector CGRID_NE', 
flags=MPP_CLOCK_SYNC+MPP_CLOCK_DETAILED ) + call mpp_clock_begin(id) + call mpp_update_domains( xr4, yr4, domain, gridtype=CGRID_NE) + call mpp_update_domains( x1r4, y1r4, domain, gridtype=CGRID_NE, complete=.false. ) + call mpp_update_domains( x2r4, y2r4, domain, gridtype=CGRID_NE, complete=.false. ) + call mpp_update_domains( x3r4, y3r4, domain, gridtype=CGRID_NE, complete=.false. ) + call mpp_update_domains( x4r4, y4r4, domain, gridtype=CGRID_NE, complete=.true. ) + call mpp_clock_end (id) + + ! redundant points must be equal and opposite + if(folded_south) then + global2r4(nx/2+1:nx, 1,:) = -global2r4(nx/2:1:-1, 1,:) + global2r4(1-whalo:0, 1,:) = -global2r4(nx-whalo+1:nx, 1, :) + global2r4(nx+1:nx+ehalo, 1,:) = -global2r4(1:ehalo, 1, :) + else if(folded_west) then + global1r4(1, ny/2+1:ny, :) = -global1r4(1, ny/2:1:-1, :) + global1r4(1, 1-shalo:0, :) = -global1r4(1, ny-shalo+1:ny, :) + global1r4(1, ny+1:ny+nhalo, :) = -global1r4(1, 1:nhalo, :) + else if(folded_east) then + global1r4(nx+shift, ny/2+1:ny, :) = -global1r4(nx+shift, ny/2:1:-1, :) + global1r4(nx+shift, 1-shalo:0, :) = -global1r4(nx+shift, ny-shalo+1:ny, :) + global1r4(nx+shift, ny+1:ny+nhalo, :) = -global1r4(nx+shift, 1:nhalo, :) + else + global2r4(nx/2+1:nx, ny+shift,:) = -global2r4(nx/2:1:-1, ny+shift,:) + if (domain_type == 'Folded xy_halo') then + global2r4(1-xhalo:0, ny+shift,:) = -global2r4(nx-xhalo+1:nx, ny+shift,:) + global2r4(nx+1:nx+xhalo, ny+shift,:) = -global2r4(1:xhalo, ny+shift,:) + else + global2r4(1-whalo:0, ny+shift,:) = -global2r4(nx-whalo+1:nx, ny+shift,:) + global2r4(nx+1:nx+ehalo, ny+shift,:) = -global2r4(1:ehalo, ny+shift,:) + end if + endif + + call compare_checksums( xr4, global1r4(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE xr4' ) + call compare_checksums( yr4, global2r4(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE yr4' ) + call compare_checksums( x1r4, global1r4(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE x1r4' ) + call compare_checksums( x2r4, 
global1r4(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE x2r4' ) + call compare_checksums( x3r4, global1r4(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE x3r4' ) + call compare_checksums( x4r4, global1r4(isd:ied+shift,jsd:jed, :), domain_type//' CGRID_NE x4r4' ) + call compare_checksums( y1r4, global2r4(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE y1r4' ) + call compare_checksums( y2r4, global2r4(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE y2r4' ) + call compare_checksums( y3r4, global2r4(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE y3r4' ) + call compare_checksums( y4r4, global2r4(isd:ied, jsd:jed+shift,:), domain_type//' CGRID_NE y4r4' ) + + deallocate(global1r4, global2r4, xr4, x1r4, x2r4, x3r4, x4r4, yr4, y1r4, y2r4, y3r4, y4r4) + + end subroutine test_halo_update_r4 + + !> test a domain update of a 64-bit 3D array on a 9-pe subset of total allotted pes + !> @note requires at least 16 pes + subroutine test_subset_update_r8( ) + real(kind=r8_kind), allocatable, dimension(:,:,:) :: x + type(domain2D) :: domain + real(kind=r8_kind), allocatable :: global(:,:,:) + integer :: i, xhalo, yhalo + integer :: is, ie, js, je, isd, ied, jsd, jed + integer :: pes9(9)=(/1,2,4,6,8,10,12,13,15/) + integer :: ni, nj + integer :: pe, npes + + pe = mpp_pe() + npes = mpp_npes() + + call mpp_declare_pelist(pes9) + if(any(mpp_pe()==pes9)) then + call mpp_set_current_pelist(pes9) + layout = (/3,3/) + ni = 3; nj =3 + call mpp_define_domains((/1,ni,1,nj/), layout, domain, xhalo=1 & + &, yhalo=1, xflags=CYCLIC_GLOBAL_DOMAIN, yflags& + &=CYCLIC_GLOBAL_DOMAIN, name='subset domain') + call mpp_get_compute_domain(domain, is, ie, js, je) + print*, "pe=", mpp_pe(), is, ie, js, je + + allocate(global(0:ni+1,0:nj+1,nz) ) + + global = 0 + do k = 1,nz + do j = 1,nj + do i = 1,ni + global(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + global(0, 1:nj,:) = global(ni, 1:nj,:) + global(ni+1, 1:nj,:) = global(1, 1:nj,:) + global(0:ni+1, 0, :) = global(0:ni+1, nj, 
:) + global(0:ni+1, nj+1,:) = global(0:ni+1, 1, :) + + ! set up x array + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + allocate( x (isd:ied,jsd:jed,nz) ) + + x(:,:,:) = 0.0 + x (is:ie,js:je,:) = global(is:ie,js:je,:) + + ! full update + call mpp_update_domains(x, domain) + call compare_checksums(x, global(isd:ied,jsd:jed,:), '64-bit array 9 pe subset') + + deallocate(x, global) + call mpp_deallocate_domain(domain) + endif + + call mpp_set_current_pelist() + + end subroutine test_subset_update_r8 + + !> test a domain update of a 32-bit 3D array on a 9-pe subset of total allotted pes + !> @note requires at least 16 pes + subroutine test_subset_update_r4( ) + real(kind=r4_kind), allocatable, dimension(:,:,:) :: x + type(domain2D) :: domain + real(kind=r4_kind), allocatable :: global(:,:,:) + integer :: i, xhalo, yhalo + integer :: is, ie, js, je, isd, ied, jsd, jed + integer :: pes9(9)=(/1,2,4,6,8,10,12,13,15/) + integer :: ni, nj + integer :: pe, npes + + pe = mpp_pe() + npes = mpp_npes() + + call mpp_declare_pelist(pes9) + if(any(mpp_pe()==pes9)) then + call mpp_set_current_pelist(pes9) + layout = (/3,3/) + ni = 3; nj =3 + call mpp_define_domains((/1,ni,1,nj/), layout, domain, xhalo=1 & + &, yhalo=1, xflags=CYCLIC_GLOBAL_DOMAIN, yflags& + &=CYCLIC_GLOBAL_DOMAIN, name='subset domain') + call mpp_get_compute_domain(domain, is, ie, js, je) + print*, "pe=", mpp_pe(), is, ie, js, je + + allocate(global(0:ni+1,0:nj+1,nz) ) + + global = 0 + do k = 1,nz + do j = 1,nj + do i = 1,ni + global(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + global(0, 1:nj,:) = global(ni, 1:nj,:) + global(ni+1, 1:nj,:) = global(1, 1:nj,:) + global(0:ni+1, 0, :) = global(0:ni+1, nj, :) + global(0:ni+1, nj+1,:) = global(0:ni+1, 1, :) + + ! 
set up x array + call mpp_get_compute_domain( domain, is, ie, js, je ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + allocate( x (isd:ied,jsd:jed,nz) ) + + x(:,:,:) = 0.0 + x (is:ie,js:je,:) = global(is:ie,js:je,:) + + ! full update + call mpp_update_domains(x, domain) + call compare_checksums(x, global(isd:ied,jsd:jed,:), '32-bit array on 9 pe subset') + + deallocate(x, global) + call mpp_deallocate_domain(domain) + endif + + call mpp_set_current_pelist() + + end subroutine test_subset_update_r4 + +end module test_mpp_update_domains_real diff --git a/test_fms/mpp/test_numb.nml b/test_fms/mpp/test_numb.nml new file mode 100644 index 0000000000..438b638508 --- /dev/null +++ b/test_fms/mpp/test_numb.nml @@ -0,0 +1,3 @@ +&test_read_input_nml_nml +test_numb = 4 +/ diff --git a/test_fms/mpp/test_numb2.nml b/test_fms/mpp/test_numb2.nml new file mode 100644 index 0000000000..f7c91a1f7a --- /dev/null +++ b/test_fms/mpp/test_numb2.nml @@ -0,0 +1,3 @@ +&test_mpp_get_ascii_lines_nml +test_number = 5 +/ diff --git a/test_fms/mpp/test_numb_ascii.nml b/test_fms/mpp/test_numb_ascii.nml new file mode 100644 index 0000000000..198d57b24b --- /dev/null +++ b/test_fms/mpp/test_numb_ascii.nml @@ -0,0 +1,3 @@ +&test_read_ascii_file_nml +test_numb = 8 +/ diff --git a/test_fms/mpp/test_numb_base.nml b/test_fms/mpp/test_numb_base.nml new file mode 100644 index 0000000000..86d5657d7c --- /dev/null +++ b/test_fms/mpp/test_numb_base.nml @@ -0,0 +1,3 @@ +&test_read_input_nml_nml +test_numb = 0 +/ diff --git a/test_fms/mpp/test_numb_base2.nml b/test_fms/mpp/test_numb_base2.nml new file mode 100644 index 0000000000..6b311f1002 --- /dev/null +++ b/test_fms/mpp/test_numb_base2.nml @@ -0,0 +1,3 @@ +&test_mpp_get_ascii_lines_nml +test_number = 0 +/ diff --git a/test_fms/mpp/test_numb_base_ascii.nml b/test_fms/mpp/test_numb_base_ascii.nml new file mode 100644 index 0000000000..b1063b0677 --- /dev/null +++ b/test_fms/mpp/test_numb_base_ascii.nml @@ -0,0 +1,3 @@ +&test_read_ascii_file_nml 
+test_numb = 0 +/ diff --git a/test_fms/mpp/test_peset b/test_fms/mpp/test_peset new file mode 100755 index 0000000000..59b2234e4a --- /dev/null +++ b/test_fms/mpp/test_peset @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_peset - temporary wrapper script for .libs/test_peset +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_peset program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. 
These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_peset:test_peset:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_peset:test_peset:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_peset:test_peset:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' 
+ if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_peset' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_read_ascii_file b/test_fms/mpp/test_read_ascii_file new file mode 100755 index 0000000000..2baae338f3 --- /dev/null +++ b/test_fms/mpp/test_read_ascii_file @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_read_ascii_file - temporary wrapper script for .libs/test_read_ascii_file +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_read_ascii_file program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. 
+sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_read_ascii_file:test_read_ascii_file:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_read_ascii_file:test_read_ascii_file:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_read_ascii_file:test_read_ascii_file:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. 
+ thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_read_ascii_file' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. 
+ $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_read_input_nml b/test_fms/mpp/test_read_input_nml new file mode 100755 index 0000000000..202f4ec7f2 --- /dev/null +++ b/test_fms/mpp/test_read_input_nml @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_read_input_nml - temporary wrapper script for .libs/test_read_input_nml +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_read_input_nml program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. 
+ if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_read_input_nml:test_read_input_nml:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_read_input_nml:test_read_input_nml:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_read_input_nml:test_read_input_nml:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. 
+ WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_read_input_nml' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_redistribute_int.F90 b/test_fms/mpp/test_redistribute_int.F90 new file mode 100644 index 0000000000..753b60b96b --- /dev/null +++ b/test_fms/mpp/test_redistribute_int.F90 @@ -0,0 +1,760 @@ +!*********************************************************************** +! GNU Lesser General Public License +! +! This file is part of the GFDL Flexible Modeling System (FMS). +! +! FMS is free software: you can redistribute it and/or modify it under +! the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation, either version 3 of the License, or (at +! 
your option) any later version. +! +! FMS is distributed in the hope that it will be useful, but WITHOUT +! ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +! FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +! for more details. +! +! You should have received a copy of the GNU Lesser General Public +! License along with FMS. If not, see . +!*********************************************************************** + +!> @author Ryan Mulhall +!> @email gfdl.climate.model.info@noaa.gov +!> @description Unit test for mpp_redistribute with both sized integers +!> also tests redistribute with mosaic cubic grid +program test_mpp_redistribute + + use mpp_mod, only : FATAL, WARNING, NOTE, MPP_INIT_TEST_INIT_TRUE_ONLY + use mpp_mod, only : mpp_pe, mpp_npes, mpp_error + use mpp_mod, only : mpp_declare_pelist, mpp_set_current_pelist, mpp_sync, mpp_sync_self + use mpp_mod, only : mpp_init, mpp_exit, stdout, stderr + use mpp_domains_mod, only : domain1D, domain2D, DomainCommunicator2D, mpp_global_field + use mpp_domains_mod, only : mpp_get_compute_domain, mpp_get_data_domain, mpp_domains_set_stack_size + use mpp_domains_mod, only : mpp_domains_init, mpp_domains_exit, mpp_broadcast_domain + use mpp_domains_mod, only : mpp_update_domains, mpp_check_field, mpp_redistribute, mpp_get_memory_domain + use mpp_domains_mod, only : mpp_define_layout, mpp_define_domains, mpp_deallocate_domain + use mpp_domains_mod, only : mpp_define_mosaic, mpp_nullify_domain_list + use platform_mod + + implicit none + integer :: pe, npes, ierr + integer :: nx=128, ny=128, nz=40, stackmax=4000000 + + call mpp_init(MPP_INIT_TEST_INIT_TRUE_ONLY) + pe = mpp_pe() + npes = mpp_npes() + call mpp_domains_init() + call mpp_domains_set_stack_size(stackmax) + + call mpp_error(NOTE, "----------Starting tests----------") + call test_redistribute_i4() + call mpp_error(NOTE, "test_mpp_redistribute: 32-bit integer test passed") + call test_redistribute_i8() + call mpp_error(NOTE, 
"test_mpp_redistribute: 64-bit integer test passed") + call cubic_grid_redistribute_i4() + call mpp_error(NOTE, "test_mpp_redistribute: 32-bit integer cubic grid test passed") + call cubic_grid_redistribute_i8() + call mpp_error(NOTE, "test_mpp_redistribute: 64-bit integer cubic grid test passed") + call mpp_error(NOTE, "----------Tests Complete----------") + + call mpp_domains_exit() + call mpi_finalize(ierr) + +contains + + !> redistribute x domain to y with 32-bit integers + subroutine test_redistribute_i4() + type(domain2D) :: domainx, domainy + type(DomainCommunicator2D), pointer, save :: dch =>NULL() + integer(i4_kind), allocatable, dimension(:,:,:) :: gcheck, glbl + integer(i4_kind), allocatable, dimension(:,:,:) :: x, x1, x2, x3, x4, x5, x6 + integer(i4_kind), allocatable, dimension(:,:,:) :: y, y1, y2, y3, y4, y5, y6 + integer :: k, j, i, layout(2), id + integer :: is, ie, js, je, isd, ied, jsd, jed + ! nullify domain list otherwise it retains memory between calls. + call mpp_nullify_domain_list(domainx) + call mpp_nullify_domain_list(domainy) + + !fill in glbl array with kiiijjj + allocate( gcheck(nx,ny,nz), glbl(nx,ny,nz) ) + do k = 1,nz + do j = 1,ny + do i = 1,nx + glbl(i,j,k) = k*1e6 + i*1e3 + j + end do + end do + end do + + ! set up x arrays + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domainx ) + call mpp_get_compute_domain( domainx, is, ie, js, je ) + call mpp_get_data_domain ( domainx, isd, ied, jsd, jed ) + allocate( x(isd:ied,jsd:jed,nz) ) + allocate( x2(isd:ied,jsd:jed,nz) ) + allocate( x3(isd:ied,jsd:jed,nz) ) + allocate( x4(isd:ied,jsd:jed,nz) ) + allocate( x5(isd:ied,jsd:jed,nz) ) + allocate( x6(isd:ied,jsd:jed,nz) ) + x = 0. 
+ x(is:ie,js:je,:) = glbl(is:ie,js:je,:) + x2 = x; x3 = x; x4 = x; x5 = x; x6 = x + + !set up y arrays + call mpp_define_domains( (/1,nx,1,ny/), (/npes,1/), domainy) + call mpp_get_data_domain ( domainy, isd, ied, jsd, jed ) + call mpp_get_compute_domain( domainy, is, ie, js, je ) + allocate( y(isd:ied,jsd:jed,nz) ) + allocate( y2(isd:ied,jsd:jed,nz) ) + allocate( y3(isd:ied,jsd:jed,nz) ) + allocate( y4(isd:ied,jsd:jed,nz) ) + allocate( y5(isd:ied,jsd:jed,nz) ) + allocate( y6(isd:ied,jsd:jed,nz) ) + y = 0. + y2 = 0.;y3 = 0.;y4 = 0.;y5 = 0.;y6 = 0. + + !go global and redistribute + call mpp_broadcast_domain(domainx) + call mpp_broadcast_domain(domainy) + call mpp_redistribute( domainx, x, domainy, y ) + + !check answers on pelist + call mpp_global_field( domainy, y(:,:,:), gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL , & + "test_mpp_redistribute: incorrect results in global array") + ! redistribute and check x answers + if(ALLOCATED(y))y=0. + call mpp_redistribute( domainx, x, domainy, y, complete=.false. ) + call mpp_redistribute( domainx, x2, domainy, y2, complete=.false. ) + call mpp_redistribute( domainx, x3, domainy, y3, complete=.false. ) + call mpp_redistribute( domainx, x4, domainy, y4, complete=.false. ) + call mpp_redistribute( domainx, x5, domainy, y5, complete=.false. ) + call mpp_redistribute( domainx, x6, domainy, y6, complete=.true., dc_handle=dch ) + call mpp_global_field( domainx, x(:,:,:), gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x") + call mpp_global_field( domainx, x2, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x2") + call mpp_global_field( domainx, x3, gcheck ) + if(.not. 
compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x3") + call mpp_global_field( domainx, x4, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x4") + call mpp_global_field( domainx, x5, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x5") + call mpp_global_field( domainx, x6, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x6") + + ! redistribute and check y answers + if(ALLOCATED(y))then + y=0.; y2=0.; y3=0.; y4=0.; y5=0.; y6=0. + endif + call mpp_redistribute( domainx, x, domainy, y, complete=.false. ) + call mpp_redistribute( domainx, x2, domainy, y2, complete=.false. ) + call mpp_redistribute( domainx, x3, domainy, y3, complete=.false. ) + call mpp_redistribute( domainx, x4, domainy, y4, complete=.false. ) + call mpp_redistribute( domainx, x5, domainy, y5, complete=.false. ) + call mpp_redistribute( domainx, x6, domainy, y6, complete=.true., dc_handle=dch ) + call mpp_global_field( domainy, y, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y") + call mpp_global_field( domainy, y2, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y2") + call mpp_global_field( domainy, y3, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y3") + call mpp_global_field( domainy, y4, gcheck ) + if(.not. 
compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y4") + call mpp_global_field( domainy, y5, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y5") + call mpp_global_field( domainy, y6, gcheck ) + if(.not. compare_result4( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y6") + + dch =>NULL() + + call mpp_redistribute( domainx, x, domainy, y, free=.true.,list_size=6 ) + deallocate(gcheck, glbl) + deallocate(x,x2,x3,x4,x5,x6) + deallocate(y,y2,y3,y4,y5,y6) + + end subroutine test_redistribute_i4 + + !> Test redistribute between two domains with 64-bit integers + subroutine test_redistribute_i8() + type(domain2D) :: domainx, domainy + type(DomainCommunicator2D), pointer, save :: dch =>NULL() + integer(i8_kind), allocatable, dimension(:,:,:) :: gcheck, glbl + integer(i8_kind), allocatable, dimension(:,:,:), save :: x, x1, x2, x3, x4, x5, x6 + integer(i8_kind), allocatable, dimension(:,:,:), save :: y, y1, y2, y3, y4, y5, y6 + integer :: k, j, i, layout(2), id + integer :: is, ie, js, je, isd, ied, jsd, jed + ! nullify domain list otherwise it retains memory between calls. + call mpp_nullify_domain_list(domainx) + call mpp_nullify_domain_list(domainy) + + !fill in glbl array with kiiijjj + allocate( gcheck(nx,ny,nz), glbl(nx,ny,nz) ) + do k = 1,nz + do j = 1,ny + do i = 1,nx + glbl(i,j,k) = k*1e6 + i*1e3 + j + end do + end do + end do + + ! 
set up x arrays + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + call mpp_define_domains( (/1,nx,1,ny/), layout, domainx ) + call mpp_get_compute_domain( domainx, is, ie, js, je ) + call mpp_get_data_domain ( domainx, isd, ied, jsd, jed ) + allocate( x(isd:ied,jsd:jed,nz) ) + allocate( x2(isd:ied,jsd:jed,nz) ) + allocate( x3(isd:ied,jsd:jed,nz) ) + allocate( x4(isd:ied,jsd:jed,nz) ) + allocate( x5(isd:ied,jsd:jed,nz) ) + allocate( x6(isd:ied,jsd:jed,nz) ) + x = 0. + x(is:ie,js:je,:) = glbl(is:ie,js:je,:) + x2 = x; x3 = x; x4 = x; x5 = x; x6 = x + + !set up y arrays + call mpp_define_domains( (/1,nx,1,ny/), (/npes,1/), domainy) + call mpp_get_data_domain ( domainy, isd, ied, jsd, jed ) + call mpp_get_compute_domain( domainy, is, ie, js, je ) + allocate( y(isd:ied,jsd:jed,nz) ) + allocate( y2(isd:ied,jsd:jed,nz) ) + allocate( y3(isd:ied,jsd:jed,nz) ) + allocate( y4(isd:ied,jsd:jed,nz) ) + allocate( y5(isd:ied,jsd:jed,nz) ) + allocate( y6(isd:ied,jsd:jed,nz) ) + y = 0. + y2 = 0.;y3 = 0.;y4 = 0.;y5 = 0.;y6 = 0. + + !go global and redistribute + call mpp_broadcast_domain(domainx) + call mpp_broadcast_domain(domainy) + call mpp_redistribute( domainx, x, domainy, y ) + + !check answers on pelist + call mpp_global_field( domainy, y(:,:,:), gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL , & + "test_mpp_redistribute: incorrect results in global array") + ! redistribute and check x answers + if(ALLOCATED(y))y=0. + call mpp_redistribute( domainx, x, domainy, y, complete=.false. ) + call mpp_redistribute( domainx, x2, domainy, y2, complete=.false. ) + call mpp_redistribute( domainx, x3, domainy, y3, complete=.false. ) + call mpp_redistribute( domainx, x4, domainy, y4, complete=.false. ) + call mpp_redistribute( domainx, x5, domainy, y5, complete=.false. ) + call mpp_redistribute( domainx, x6, domainy, y6, complete=.true., dc_handle=dch ) + call mpp_global_field( domainx, x(:,:,:), gcheck ) + if(.not. 
compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x") + call mpp_global_field( domainx, x2, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x2") + call mpp_global_field( domainx, x3, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x3") + call mpp_global_field( domainx, x4, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x4") + call mpp_global_field( domainx, x5, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x5") + call mpp_global_field( domainx, x6, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for x6") + + ! redistribute and check y answers + if(ALLOCATED(y))then + y=0.; y2=0.; y3=0.; y4=0.; y5=0.; y6=0. + endif + call mpp_redistribute( domainx, x, domainy, y, complete=.false. ) + call mpp_redistribute( domainx, x2, domainy, y2, complete=.false. ) + call mpp_redistribute( domainx, x3, domainy, y3, complete=.false. ) + call mpp_redistribute( domainx, x4, domainy, y4, complete=.false. ) + call mpp_redistribute( domainx, x5, domainy, y5, complete=.false. ) + call mpp_redistribute( domainx, x6, domainy, y6, complete=.true., dc_handle=dch ) + call mpp_global_field( domainy, y, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y") + call mpp_global_field( domainy, y2, gcheck ) + if(.not. 
compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y2") + call mpp_global_field( domainy, y3, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y3") + call mpp_global_field( domainy, y4, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y4") + call mpp_global_field( domainy, y5, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y5") + call mpp_global_field( domainy, y6, gcheck ) + if(.not. compare_result8( glbl(1:nx,1:ny,:), gcheck )) call mpp_error(FATAL,& + "test_mpp_redistribute: global array differs for y6") + + dch =>NULL() + + call mpp_redistribute( domainx, x, domainy, y, free=.true.,list_size=6 ) + deallocate(gcheck, glbl) + deallocate(x,x2,x3,x4,x5,x6) + deallocate(y,y2,y3,y4,y5,y6) + + end subroutine test_redistribute_i8 + !> Tests redistribute with cubic grid and 32-bit ints + subroutine cubic_grid_redistribute_i4 + + integer :: npes, npes_per_ensemble, npes_per_tile + integer :: ensemble_id, tile_id, ensemble_tile_id + integer :: i, j, p, n, ntiles, my_root_pe, k + integer :: isc_ens, iec_ens, jsc_ens, jec_ens + integer :: isd_ens, ied_ens, jsd_ens, jed_ens + integer :: isc, iec, jsc, jec + integer :: isd, ied, jsd, jed, layout_ensemble(2) = (/0,0/) + integer, allocatable :: my_ensemble_pelist(:), pe_start(:), pe_end(:) + integer, allocatable :: global_indices(:,:), layout2D(:,:) + integer(i4_kind), allocatable :: x(:,:,:,:), x_ens(:,:,:), y(:,:,:) + integer :: layout(2), ensemble_size = 1, layout_cubic(2) = (/0,0/) + type(domain2D) :: domain + type(domain2D), allocatable :: domain_ensemble(:) + character(len=128) :: mesg + integer :: nx_cubic = 20, ny_cubic = 20 + + logical :: check + ! 
set up pelist + npes = mpp_npes() + if(mod(npes, ensemble_size) .NE. 0) call mpp_error(FATAL, & + "test_mpp_domains: npes is not divisible by ensemble_size") + npes_per_ensemble = npes/ensemble_size + allocate(my_ensemble_pelist(0:npes_per_ensemble-1)) + ensemble_id = mpp_pe()/npes_per_ensemble + 1 + do p = 0, npes_per_ensemble-1 + my_ensemble_pelist(p) = (ensemble_id-1)*npes_per_ensemble + p + enddo + + call mpp_declare_pelist(my_ensemble_pelist) + + ! set tile count + ntiles = 6 + + if( mod(npes, ntiles) .NE. 0 ) call mpp_error(FATAL, & + "test_mpp_domains: npes is not divisible by ntiles") + + npes_per_tile = npes/ntiles + tile_id = mpp_pe()/npes_per_tile + 1 + if( npes_per_tile == layout_cubic(1) * layout_cubic(2) ) then + layout = layout_cubic + else + call mpp_define_layout( (/1,nx_cubic,1,ny_cubic/), npes_per_tile, layout ) + endif + allocate(global_indices(4, ntiles)) + allocate(layout2D(2, ntiles)) + allocate(pe_start(ntiles), pe_end(ntiles)) + do n = 1, ntiles + global_indices(:,n) = (/1,nx_cubic,1,ny_cubic/) + layout2D(:,n) = layout + end do + + do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + + call define_cubic_mosaic("cubic_grid", domain, (/nx_cubic,nx_cubic,nx_cubic,nx_cubic,nx_cubic,nx_cubic/), & + (/ny_cubic,ny_cubic,ny_cubic,ny_cubic,ny_cubic,ny_cubic/), & + global_indices, layout2D, pe_start, pe_end ) + + allocate(domain_ensemble(ensemble_size)) + !-- define domain for each ensemble + call mpp_set_current_pelist( my_ensemble_pelist ) + if( mod(npes_per_ensemble, ntiles) .NE. 
0 ) call mpp_error(FATAL, & + "test_mpp_domains: npes_per_ensemble is not divisible by ntiles") + npes_per_tile = npes_per_ensemble/ntiles + my_root_pe = my_ensemble_pelist(0) + ensemble_tile_id = (mpp_pe() - my_root_pe)/npes_per_tile + 1 + + if( npes_per_tile == layout_ensemble(1) * layout_ensemble(2) ) then + layout = layout_ensemble + else + call mpp_define_layout( (/1,nx_cubic,1,ny_cubic/), npes_per_tile, layout ) + endif + do n = 1, ntiles + global_indices(:,n) = (/1,nx_cubic,1,ny_cubic/) + layout2D(:,n) = layout + end do + + do n = 1, ntiles + pe_start(n) = my_root_pe + (n-1)*npes_per_tile + pe_end(n) = my_root_pe + n*npes_per_tile-1 + end do + + call define_cubic_mosaic("cubic_grid",domain_ensemble(ensemble_id),(/nx_cubic,nx_cubic,nx_cubic,nx_cubic,nx_cubic,nx_cubic/)& + ,(/ny_cubic,ny_cubic,ny_cubic,ny_cubic,ny_cubic,ny_cubic/), & + global_indices, layout2D, pe_start, pe_end ) + + call mpp_set_current_pelist() + do n = 1, ensemble_size + call mpp_broadcast_domain(domain_ensemble(n)) + enddo + + call mpp_get_data_domain( domain_ensemble(ensemble_id), isd_ens, ied_ens, jsd_ens, jed_ens) + call mpp_get_compute_domain( domain_ensemble(ensemble_id), isc_ens, iec_ens, jsc_ens, jec_ens) + call mpp_get_data_domain( domain, isd, ied, jsd, jed) + call mpp_get_compute_domain( domain, isc, iec, jsc, jec) + + allocate(x_ens(isd_ens:ied_ens, jsd_ens:jed_ens, nz)) + allocate(x(isd:ied, jsd:jed, nz, ensemble_size)) + allocate(y(isd:ied, jsd:jed, nz)) + + x = 0 + do k = 1, nz + do j = jsc_ens, jec_ens + do i = isc_ens, iec_ens + x_ens(i,j,k) = ensemble_id *1e6 + ensemble_tile_id*1e3 + i + j * 1.e-3 + k * 1.e-6 + enddo + enddo + enddo + + do n = 1, ensemble_size + x = 0 + call mpp_redistribute( domain_ensemble(n), x_ens, domain, x(:,:,:,n) ) + y = 0 + do k = 1, nz + do j = jsc, jec + do i = isc, iec + y(i,j,k) = n *1e6 + tile_id*1e3 + i + j * 1.e-3 + k * 1.e-6 + enddo + enddo + enddo + write(mesg,'(a,i4)') "cubic_grid redistribute from ensemble", n + if(.not.compare_result4( 
x(isc:iec,jsc:jec,:,n), y(isc:iec,jsc:jec,:))) call & + mpp_error(FATAL, "test_mpp_redistribute: failed cubic grid 32-bit check") + enddo + + ! redistribute data to each ensemble. + deallocate(x,y,x_ens) + allocate(x(isd:ied, jsd:jed, nz, ensemble_size)) + allocate(x_ens(isd_ens:ied_ens, jsd_ens:jed_ens, nz)) + allocate(y(isd_ens:ied_ens, jsd_ens:jed_ens, nz)) + + y = 0 + do k = 1, nz + do j = jsc, jec + do i = isc, iec + x(i,j,k,:) = i + j * 1.e-3 + k * 1.e-6 + enddo + enddo + enddo + + do n = 1, ensemble_size + x_ens = 0 + call mpp_redistribute(domain, x(:,:,:,n), domain_ensemble(n), x_ens) + y = 0 + if( ensemble_id == n ) then + do k = 1, nz + do j = jsc_ens, jec_ens + do i = isc_ens, iec_ens + y(i,j,k) = i + j * 1.e-3 + k * 1.e-6 + enddo + enddo + enddo + endif + write(mesg,'(a,i4)') "cubic_grid redistribute to ensemble", n + if(.not.compare_result4( x_ens(isc_ens:iec_ens,jsc_ens:jec_ens,:), y(isc_ens:iec_ens,jsc_ens:jec_ens,:))) call & + mpp_error(FATAL, "test_mpp_redistribute: failed cubic grid 32-bit check") + enddo + + deallocate(x, y, x_ens) + call mpp_deallocate_domain(domain) + do n = 1, ensemble_size + call mpp_deallocate_domain(domain_ensemble(n)) + enddo + deallocate(domain_ensemble) + + end subroutine cubic_grid_redistribute_i4 + + !> Tests redistribute with cubic grid and 64-bit ints + subroutine cubic_grid_redistribute_i8 + + integer :: npes, npes_per_ensemble, npes_per_tile + integer :: ensemble_id, tile_id, ensemble_tile_id + integer :: i, j, p, n, ntiles, my_root_pe, k + integer :: isc_ens, iec_ens, jsc_ens, jec_ens + integer :: isd_ens, ied_ens, jsd_ens, jed_ens + integer :: isc, iec, jsc, jec + integer :: isd, ied, jsd, jed, layout_ensemble(2) = (/0,0/) + integer, allocatable :: my_ensemble_pelist(:), pe_start(:), pe_end(:) + integer, allocatable :: global_indices(:,:), layout2D(:,:) + integer(i8_kind), allocatable :: x(:,:,:,:), x_ens(:,:,:), y(:,:,:) + integer :: layout(2), ensemble_size = 1, layout_cubic(2) = (/0,0/) + type(domain2D) :: 
domain + type(domain2D), allocatable :: domain_ensemble(:) + character(len=128) :: mesg + integer :: nx_cubic = 20, ny_cubic = 20 + logical :: check + + ! --- set up pelist + npes = mpp_npes() + if(mod(npes, ensemble_size) .NE. 0) call mpp_error(FATAL, & + "test_mpp_domains: npes is not divisible by ensemble_size") + npes_per_ensemble = npes/ensemble_size + allocate(my_ensemble_pelist(0:npes_per_ensemble-1)) + ensemble_id = mpp_pe()/npes_per_ensemble + 1 + do p = 0, npes_per_ensemble-1 + my_ensemble_pelist(p) = (ensemble_id-1)*npes_per_ensemble + p + enddo + + call mpp_declare_pelist(my_ensemble_pelist) + + !--- define a mosaic use all the pelist + ntiles = 6 + + + if( mod(npes, ntiles) .NE. 0 ) call mpp_error(FATAL, & + "test_mpp_domains: npes is not divisible by ntiles") + + npes_per_tile = npes/ntiles + tile_id = mpp_pe()/npes_per_tile + 1 + if( npes_per_tile == layout_cubic(1) * layout_cubic(2) ) then + layout = layout_cubic + else + call mpp_define_layout( (/1,nx_cubic,1,ny_cubic/), npes_per_tile, layout ) + endif + allocate(global_indices(4, ntiles)) + allocate(layout2D(2, ntiles)) + allocate(pe_start(ntiles), pe_end(ntiles)) + do n = 1, ntiles + global_indices(:,n) = (/1,nx_cubic,1,ny_cubic/) + layout2D(:,n) = layout + end do + + do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + + call define_cubic_mosaic("cubic_grid", domain, (/nx_cubic,nx_cubic,nx_cubic,nx_cubic,nx_cubic,nx_cubic/), & + (/ny_cubic,ny_cubic,ny_cubic,ny_cubic,ny_cubic,ny_cubic/), & + global_indices, layout2D, pe_start, pe_end ) + + allocate(domain_ensemble(ensemble_size)) + !-- define domain for each ensemble + call mpp_set_current_pelist( my_ensemble_pelist ) + if( mod(npes_per_ensemble, ntiles) .NE. 
0 ) call mpp_error(FATAL, & + "test_mpp_domains: npes_per_ensemble is not divisible by ntiles") + npes_per_tile = npes_per_ensemble/ntiles + my_root_pe = my_ensemble_pelist(0) + ensemble_tile_id = (mpp_pe() - my_root_pe)/npes_per_tile + 1 + + if( npes_per_tile == layout_ensemble(1) * layout_ensemble(2) ) then + layout = layout_ensemble + else + call mpp_define_layout( (/1,nx_cubic,1,ny_cubic/), npes_per_tile, layout ) + endif + do n = 1, ntiles + global_indices(:,n) = (/1,nx_cubic,1,ny_cubic/) + layout2D(:,n) = layout + end do + + do n = 1, ntiles + pe_start(n) = my_root_pe + (n-1)*npes_per_tile + pe_end(n) = my_root_pe + n*npes_per_tile-1 + end do + + call define_cubic_mosaic("cubic_grid",domain_ensemble(ensemble_id),(/nx_cubic,nx_cubic,nx_cubic,nx_cubic,nx_cubic,nx_cubic/)& + ,(/ny_cubic,ny_cubic,ny_cubic,ny_cubic,ny_cubic,ny_cubic/), & + global_indices, layout2D, pe_start, pe_end ) + + call mpp_set_current_pelist() + do n = 1, ensemble_size + call mpp_broadcast_domain(domain_ensemble(n)) + enddo + + call mpp_get_data_domain( domain_ensemble(ensemble_id), isd_ens, ied_ens, jsd_ens, jed_ens) + call mpp_get_compute_domain( domain_ensemble(ensemble_id), isc_ens, iec_ens, jsc_ens, jec_ens) + call mpp_get_data_domain( domain, isd, ied, jsd, jed) + call mpp_get_compute_domain( domain, isc, iec, jsc, jec) + + allocate(x_ens(isd_ens:ied_ens, jsd_ens:jed_ens, nz)) + allocate(x(isd:ied, jsd:jed, nz, ensemble_size)) + allocate(y(isd:ied, jsd:jed, nz)) + + x = 0 + do k = 1, nz + do j = jsc_ens, jec_ens + do i = isc_ens, iec_ens + x_ens(i,j,k) = ensemble_id *1e6 + ensemble_tile_id*1e3 + i + j * 1.e-3 + k * 1.e-6 + enddo + enddo + enddo + + do n = 1, ensemble_size + x = 0 + call mpp_redistribute( domain_ensemble(n), x_ens, domain, x(:,:,:,n) ) + y = 0 + do k = 1, nz + do j = jsc, jec + do i = isc, iec + y(i,j,k) = n *1e6 + tile_id*1e3 + i + j * 1.e-3 + k * 1.e-6 + enddo + enddo + enddo + write(mesg,'(a,i4)') "cubic_grid redistribute from ensemble", n + if(.not.compare_result8( 
x(isc:iec,jsc:jec,:,n), y(isc:iec,jsc:jec,:))) call & + mpp_error(FATAL, "test_mpp_redistribute: failed cubic grid 32-bit check") + enddo + + ! redistribute data to each ensemble. + deallocate(x,y,x_ens) + allocate(x(isd:ied, jsd:jed, nz, ensemble_size)) + allocate(x_ens(isd_ens:ied_ens, jsd_ens:jed_ens, nz)) + allocate(y(isd_ens:ied_ens, jsd_ens:jed_ens, nz)) + + y = 0 + do k = 1, nz + do j = jsc, jec + do i = isc, iec + x(i,j,k,:) = i + j * 1.e-3 + k * 1.e-6 + enddo + enddo + enddo + + do n = 1, ensemble_size + x_ens = 0 + call mpp_redistribute(domain, x(:,:,:,n), domain_ensemble(n), x_ens) + y = 0 + if( ensemble_id == n ) then + do k = 1, nz + do j = jsc_ens, jec_ens + do i = isc_ens, iec_ens + y(i,j,k) = i + j * 1.e-3 + k * 1.e-6 + enddo + enddo + enddo + endif + write(mesg,'(a,i4)') "cubic_grid redistribute to ensemble", n + if(.not.compare_result8( x_ens(isc_ens:iec_ens,jsc_ens:jec_ens,:), y(isc_ens:iec_ens,jsc_ens:jec_ens,:))) call & + mpp_error(FATAL, "test_mpp_redistribute: failed cubic grid 32-bit check") + enddo + + deallocate(x, y, x_ens) + call mpp_deallocate_domain(domain) + do n = 1, ensemble_size + call mpp_deallocate_domain(domain_ensemble(n)) + enddo + deallocate(domain_ensemble) + + end subroutine cubic_grid_redistribute_i8 + + ! define mosaic domain for cubic grid + subroutine define_cubic_mosaic(type, domain, ni, nj, global_indices, layout, pe_start, pe_end) + character(len=*), intent(in) :: type + type(domain2d), intent(inout) :: domain + integer, intent(in) :: global_indices(:,:), layout(:,:) + integer, intent(in) :: ni(:), nj(:) + integer, intent(in) :: pe_start(:), pe_end(:) + integer, dimension(12) :: istart1, iend1, jstart1, jend1, tile1 + integer, dimension(12) :: istart2, iend2, jstart2, jend2, tile2 + integer :: ntiles, num_contact, msize(2) + integer :: nhalo=1, shalo=1, ehalo=1, whalo=1 + + ntiles = 6 + num_contact = 12 + if(size(pe_start(:)) .NE. 6 .OR. size(pe_end(:)) .NE. 
6 ) call mpp_error(FATAL, & + "define_cubic_mosaic: size of pe_start and pe_end should be 6") + if(size(global_indices,1) .NE. 4) call mpp_error(FATAL, & + "define_cubic_mosaic: size of first dimension of global_indices should be 4") + if(size(global_indices,2) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of second dimension of global_indices should be 6") + if(size(layout,1) .NE. 2) call mpp_error(FATAL, & + "define_cubic_mosaic: size of first dimension of layout should be 2") + if(size(layout,2) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of second dimension of layout should be 6") + if(size(ni(:)) .NE. 6 .OR. size(nj(:)) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of ni and nj should be 6") + + !--- Contact line 1, between tile 1 (EAST) and tile 2 (WEST) + tile1(1) = 1; tile2(1) = 2 + istart1(1) = ni(1); iend1(1) = ni(1); jstart1(1) = 1; jend1(1) = nj(1) + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = nj(2) + !--- Contact line 2, between tile 1 (NORTH) and tile 3 (WEST) + tile1(2) = 1; tile2(2) = 3 + istart1(2) = 1; iend1(2) = ni(1); jstart1(2) = nj(1); jend1(2) = nj(1) + istart2(2) = 1; iend2(2) = 1; jstart2(2) = nj(3); jend2(2) = 1 + !--- Contact line 3, between tile 1 (WEST) and tile 5 (NORTH) + tile1(3) = 1; tile2(3) = 5 + istart1(3) = 1; iend1(3) = 1; jstart1(3) = 1; jend1(3) = nj(1) + istart2(3) = ni(5); iend2(3) = 1; jstart2(3) = nj(5); jend2(3) = nj(5) + !--- Contact line 4, between tile 1 (SOUTH) and tile 6 (NORTH) + tile1(4) = 1; tile2(4) = 6 + istart1(4) = 1; iend1(4) = ni(1); jstart1(4) = 1; jend1(4) = 1 + istart2(4) = 1; iend2(4) = ni(6); jstart2(4) = nj(6); jend2(4) = nj(6) + !--- Contact line 5, between tile 2 (NORTH) and tile 3 (SOUTH) + tile1(5) = 2; tile2(5) = 3 + istart1(5) = 1; iend1(5) = ni(2); jstart1(5) = nj(2); jend1(5) = nj(2) + istart2(5) = 1; iend2(5) = ni(3); jstart2(5) = 1; jend2(5) = 1 + !--- Contact line 6, between tile 2 (EAST) and tile 4 (SOUTH) + tile1(6) = 2; tile2(6) 
= 4 + istart1(6) = ni(2); iend1(6) = ni(2); jstart1(6) = 1; jend1(6) = nj(2) + istart2(6) = ni(4); iend2(6) = 1; jstart2(6) = 1; jend2(6) = 1 + !--- Contact line 7, between tile 2 (SOUTH) and tile 6 (EAST) + tile1(7) = 2; tile2(7) = 6 + istart1(7) = 1; iend1(7) = ni(2); jstart1(7) = 1; jend1(7) = 1 + istart2(7) = ni(6); iend2(7) = ni(6); jstart2(7) = nj(6); jend2(7) = 1 + !--- Contact line 8, between tile 3 (EAST) and tile 4 (WEST) + tile1(8) = 3; tile2(8) = 4 + istart1(8) = ni(3); iend1(8) = ni(3); jstart1(8) = 1; jend1(8) = nj(3) + istart2(8) = 1; iend2(8) = 1; jstart2(8) = 1; jend2(8) = nj(4) + !--- Contact line 9, between tile 3 (NORTH) and tile 5 (WEST) + tile1(9) = 3; tile2(9) = 5 + istart1(9) = 1; iend1(9) = ni(3); jstart1(9) = nj(3); jend1(9) = nj(3) + istart2(9) = 1; iend2(9) = 1; jstart2(9) = nj(5); jend2(9) = 1 + !--- Contact line 10, between tile 4 (NORTH) and tile 5 (SOUTH) + tile1(10) = 4; tile2(10) = 5 + istart1(10) = 1; iend1(10) = ni(4); jstart1(10) = nj(4); jend1(10) = nj(4) + istart2(10) = 1; iend2(10) = ni(5); jstart2(10) = 1; jend2(10) = 1 + !--- Contact line 11, between tile 4 (EAST) and tile 6 (SOUTH) + tile1(11) = 4; tile2(11) = 6 + istart1(11) = ni(4); iend1(11) = ni(4); jstart1(11) = 1; jend1(11) = nj(4) + istart2(11) = ni(6); iend2(11) = 1; jstart2(11) = 1; jend2(11) = 1 + !--- Contact line 12, between tile 5 (EAST) and tile 6 (WEST) + tile1(12) = 5; tile2(12) = 6 + istart1(12) = ni(5); iend1(12) = ni(5); jstart1(12) = 1; jend1(12) = nj(5) + istart2(12) = 1; iend2(12) = 1; jstart2(12) = 1; jend2(12) = nj(6) + msize(1) = maxval(ni(:)/layout(1,:)) + whalo + ehalo + 1 ! make sure memory domain size is no smaller than + msize(2) = maxval(nj(:)/layout(2,:)) + shalo + nhalo + 1 ! 
data domain size + + call mpp_define_mosaic(global_indices, layout, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, symmetry = .true., whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name = trim(type), memory_size = msize ) + + end subroutine define_cubic_mosaic + + !> checks if i4 arrays are equal + function compare_result4(a, b) + integer(i4_kind), intent(in), dimension(:,:,:) :: a, b + logical :: compare_result4 + if(size(a,1).ne.size(b,1) .or. size(a,2).ne.size(b,2) .or. size(a,3).ne.size(b,3)) call & + mpp_error(FATAL, "test_mpp_redistribute: comparing different sized arrays") + compare_result4 = all(a.eq.b) + end function compare_result4 + + !> checks if i8 arrays are equal + function compare_result8(a, b) + integer(i8_kind), intent(in), dimension(:,:,:) :: a, b + logical :: compare_result8 + if(size(a,1).ne.size(b,1) .or. size(a,2).ne.size(b,2) .or. size(a,3).ne.size(b,3)) call & + mpp_error(FATAL, "test_mpp_redistribute: comparing different sized arrays") + compare_result8 = all(a.eq.b) + end function compare_result8 +end program test_mpp_redistribute \ No newline at end of file diff --git a/test_fms/mpp/test_redistribute_int.sh b/test_fms/mpp/test_redistribute_int.sh new file mode 100755 index 0000000000..4814def2b8 --- /dev/null +++ b/test_fms/mpp/test_redistribute_int.sh @@ -0,0 +1,57 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. 
+# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Ryan Mulhall 2020 + +# Set common test settings. +. ../test_common.sh + +skip_test="no" + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi + +# Do we need to oversubscribe +if [ ${nProc} -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. + skip_test="skip" +elif [ $nProc -lt 4 ] +then + # Need to oversubscribe the MPI + run_test test_redistribute_int 6 $skip_test "true" +fi + +touch input.nml +run_test test_redistribute_int 6 $skip_test diff --git a/test_fms/mpp/test_stderr b/test_fms/mpp/test_stderr new file mode 100755 index 0000000000..32d0186d42 --- /dev/null +++ b/test_fms/mpp/test_stderr @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_stderr - temporary wrapper script for .libs/test_stderr +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_stderr program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. 
+sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_stderr:test_stderr:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_stderr:test_stderr:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_stderr:test_stderr:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. 
+ + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_stderr' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 
1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_stdout b/test_fms/mpp/test_stdout new file mode 100755 index 0000000000..acb7b39396 --- /dev/null +++ b/test_fms/mpp/test_stdout @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_stdout - temporary wrapper script for .libs/test_stdout +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_stdout program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. 
These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. +lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_stdout:test_stdout:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. 
Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_stdout:test_stdout:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_stdout:test_stdout:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. + + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' 
+ if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_stdout' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_system_clock b/test_fms/mpp/test_system_clock new file mode 100755 index 0000000000..9496ed3274 --- /dev/null +++ b/test_fms/mpp/test_system_clock @@ -0,0 +1,210 @@ +#! /bin/sh + +# test_system_clock - temporary wrapper script for .libs/test_system_clock +# Generated by libtool (GNU libtool) 2.4.2 +# +# The test_system_clock program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. 
+sed_quote_subst='s/\([`"$\\]\)/\\\1/g' + +# Be Bourne compatible +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac +fi +BIN_SH=xpg4; export BIN_SH # for Tru64 +DUALCASE=1; export DUALCASE # for MKS sh + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +relink_command="" + +# This environment variable determines our operation mode. +if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then + # install mode needs the following variables: + generated_by_libtool_version='2.4.2' + notinst_deplibs=' ../../libFMS/libFMS.la' +else + # When we are sourced in execute mode, $file and $ECHO are already set. + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + file="$0" + +# A function that is used when there is no print builtin or printf. +func_fallback_echo () +{ + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' +} + ECHO="printf %s\\n" + fi + +# Very basic option parsing. These options are (a) specific to +# the libtool wrapper, (b) are identical between the wrapper +# /script/ and the wrapper /executable/ which is used only on +# windows platforms, and (c) all begin with the string --lt- +# (application programs are unlikely to have options which match +# this pattern). +# +# There are only two supported options: --lt-debug and +# --lt-dump-script. There is, deliberately, no --lt-help. +# +# The first argument to this parsing function should be the +# script's ../../libtool value, followed by no. 
+lt_option_debug= +func_parse_lt_options () +{ + lt_script_arg0=$0 + shift + for lt_opt + do + case "$lt_opt" in + --lt-debug) lt_option_debug=1 ;; + --lt-dump-script) + lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'` + test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=. + lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'` + cat "$lt_dump_D/$lt_dump_F" + exit 0 + ;; + --lt-*) + $ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2 + exit 1 + ;; + esac + done + + # Print the debug banner immediately: + if test -n "$lt_option_debug"; then + echo "test_system_clock:test_system_clock:${LINENO}: libtool wrapper (GNU libtool) 2.4.2" 1>&2 + fi +} + +# Used when --lt-debug. Prints its arguments to stdout +# (redirection is the responsibility of the caller) +func_lt_dump_args () +{ + lt_dump_args_N=1; + for lt_arg + do + $ECHO "test_system_clock:test_system_clock:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg" + lt_dump_args_N=`expr $lt_dump_args_N + 1` + done +} + +# Core function for launching the target application +func_exec_program_core () +{ + + if test -n "$lt_option_debug"; then + $ECHO "test_system_clock:test_system_clock:${LINENO}: newargv[0]: $progdir/$program" 1>&2 + func_lt_dump_args ${1+"$@"} 1>&2 + fi + exec "$progdir/$program" ${1+"$@"} + + $ECHO "$0: cannot exec $program $*" 1>&2 + exit 1 +} + +# A function to encapsulate launching the target application +# Strips options in the --lt-* namespace from $@ and +# launches target application with the remaining arguments. +func_exec_program () +{ + case " $* " in + *\ --lt-*) + for lt_wr_arg + do + case $lt_wr_arg in + --lt-*) ;; + *) set x "$@" "$lt_wr_arg"; shift;; + esac + shift + done ;; + esac + func_exec_program_core ${1+"$@"} +} + + # Parse options + func_parse_lt_options "$0" ${1+"$@"} + + # Find the directory that this script lives in. + thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + test "x$thisdir" = "x$file" && thisdir=. 
+ + # Follow symbolic links until we get to the real thisdir. + file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'` + while test -n "$file"; do + destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'` + + # If there was a directory component, then change thisdir. + if test "x$destdir" != "x$file"; then + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;; + *) thisdir="$thisdir/$destdir" ;; + esac + fi + + file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'` + file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'` + done + + # Usually 'no', except on cygwin/mingw when embedded into + # the cwrapper. + WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no + if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then + # special case for '.' + if test "$thisdir" = "."; then + thisdir=`pwd` + fi + # remove .libs from thisdir + case "$thisdir" in + *[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;; + .libs ) thisdir=. ;; + esac + fi + + # Try to get the absolute directory name. + absdir=`cd "$thisdir" && pwd` + test -n "$absdir" && thisdir="$absdir" + + program='test_system_clock' + progdir="$thisdir/.libs" + + + if test -f "$progdir/$program"; then + # Add our own library path to LD_LIBRARY_PATH + LD_LIBRARY_PATH="/home/Mikyung.Lee/FMS/libFMS/.libs:/opt/netcdf/4.7.4/INTEL/lib64:/opt/hdf5/1.12.0/INTEL/lib:$LD_LIBRARY_PATH" + + # Some systems cannot cope with colon-terminated LD_LIBRARY_PATH + # The second colon is a workaround for a bug in BeOS R4 sed + LD_LIBRARY_PATH=`$ECHO "$LD_LIBRARY_PATH" | /usr/bin/sed 's/::*$//'` + + export LD_LIBRARY_PATH + + if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then + # Run the actual program with our arguments. + func_exec_program ${1+"$@"} + fi + else + # The program doesn't exist. + $ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2 + $ECHO "This script is just a wrapper for $program." 1>&2 + $ECHO "See the libtool documentation for more information." 
1>&2 + exit 1 + fi +fi diff --git a/test_fms/mpp/test_system_clock.F90 b/test_fms/mpp/test_system_clock.F90 index 2ce6437f18..92cd9a05da 100644 --- a/test_fms/mpp/test_system_clock.F90 +++ b/test_fms/mpp/test_system_clock.F90 @@ -30,7 +30,7 @@ !! the second call returns a COUNT value greater than that of the !! first call. module include_files_mod -#include "../../include/fms_platform.h" + use platform_mod logical :: first_call_system_clock_mpi=.TRUE. contains #include "../../mpp/include/system_clock.h" @@ -41,7 +41,7 @@ program test_system_clock use mpp_mod, only : mpp_init, mpp_init_test_init_true_only, stderr, stdout, mpp_error, FATAL implicit none - integer(LONG_KIND) :: count1, count_rate1, count_max1, count2, count_rate2, count_max2 + integer(i8_kind) :: count1, count_rate1, count_max1, count2, count_rate2, count_max2 integer :: ierr !> Initialize mpp call mpp_init(test_level=mpp_init_test_init_true_only) diff --git a/test_fms/mpp/test_update_domains_performance.F90 b/test_fms/mpp/test_update_domains_performance.F90 new file mode 100644 index 0000000000..32a8fe7857 --- /dev/null +++ b/test_fms/mpp/test_update_domains_performance.F90 @@ -0,0 +1,1934 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . 
+!*********************************************************************** +!> @author Jessica Liptak +!> @brief Run performance tests using blocking communications with mpp_update_domains, +!! and non-blocking communications with mpp_start_update_domains and mpp_complete_update_domains. +!! +!! The test cases are 'folded_north' and 'cubic_grid' +program test_update_domains_performance + use compare_data_checksums, only : compare_checksums + use compare_data_checksums_int, only : compare_checksums_int + use mpp_mod, only : FATAL, WARNING, NOTE, MPP_CLOCK_SYNC,MPP_CLOCK_DETAILED + use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe, mpp_error + use mpp_mod, only : mpp_clock_id, mpp_clock_begin, mpp_clock_end, mpp_sync + use mpp_mod, only : mpp_init_test_requests_allocated + use mpp_domains_mod, only : BGRID_NE, CGRID_NE, AGRID, SCALAR_PAIR, MPP_DOMAIN_TIME + use mpp_domains_mod, only : domain2D + use mpp_domains_mod, only : mpp_get_compute_domain, mpp_get_data_domain, mpp_domains_set_stack_size + use mpp_domains_mod, only : mpp_domains_init, mpp_domains_exit + use mpp_domains_mod, only : mpp_update_domains, mpp_get_memory_domain + use mpp_domains_mod, only : mpp_define_layout + use mpp_domains_mod, only : mpp_define_mosaic + use mpp_domains_mod, only : NORTH, SOUTH, WEST, EAST, CENTER + use mpp_domains_mod, only : mpp_get_global_domain, ZERO + use mpp_domains_mod, only : mpp_start_update_domains, mpp_complete_update_domains + use mpp_io_mod, only: mpp_io_init + use platform_mod + + implicit none + + integer :: ierr, id + integer :: pe, npes + integer :: nx=64, ny=64, nz=10, stackmax=10000000 + integer :: i, j, k, n + integer :: layout(2) + integer :: mpes = 0 + integer :: whalo = 2, ehalo = 2, shalo = 2, nhalo = 2 + integer :: x_cyclic_offset = 3 ! to be used in test_cyclic_offset + integer :: y_cyclic_offset = -4 ! 
to be used in test_cyclic_offset + character(len=32) :: warn_level = "fatal" + integer :: wide_halo_x = 0, wide_halo_y = 0 + integer :: nx_cubic = 0, ny_cubic = 0 + integer :: num_fields = 4 + integer :: ensemble_size = 1 + integer :: num_iter = 1 + integer :: layout_cubic(2) = (/0,0/) + integer :: layout_tripolar(2) = (/0,0/) + integer :: layout_ensemble(2) = (/0,0/) + logical :: do_sleep = .false. + logical :: mix_2D_3D = .false. + !> Initialize mpp and mpp IO modules + call mpp_init(test_level=mpp_init_test_requests_allocated) + call mpp_io_init() + call mpp_domains_init(MPP_DOMAIN_TIME) + call mpp_domains_set_stack_size(stackmax) + pe = mpp_pe() + npes = mpp_npes() + !> run the tests + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Calling 64-bit real update_domains_performance tests <-------------------' + call update_domains_performance_r8('Folded-north') + call update_domains_performance_r8('Cubic-Grid') + call update_domains_performance_r8('Single-Tile') + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Finished 64-bit real update_domains_performance tests <-------------------' + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Calling 32-bit real update_domains_performance tests <-------------------' + call update_domains_performance_r4('Folded-north') + call update_domains_performance_r4('Cubic-Grid') + call update_domains_performance_r4('Single-Tile') + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Finished 32-bit real update_domains_performance tests' + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Calling 64-bit integer update_domains_performance tests <-------------------' + call update_domains_performance_i8('Folded-north') + call update_domains_performance_i8('Cubic-Grid') + call update_domains_performance_i8('Single-Tile') + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Finished 64-bit integer update_domains_performance tests 
<-------------------' + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Finished 64-bit integer update_domains_performance tests' + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Calling 32-bit integer update_domains_performance tests <-------------------' + call update_domains_performance_i4('Folded-north') + call update_domains_performance_i4('Cubic-Grid') + call update_domains_performance_i4('Single-Tile') + if (mpp_pe() == mpp_root_pe()) & + print *, '--------------------> Finished 32-bit integer update_domains_performance tests <-------------------' + call mpp_domains_exit() + !> Finalize mpp + call MPI_FINALIZE(ierr) + contains + + !> run performance tests on 64-bit real arrays + subroutine update_domains_performance_r8( test_type ) + character(len=*), intent(in) :: test_type + + type(domain2D) :: domain + integer :: num_contact, ntiles, npes_per_tile, ntile_per_pe + integer :: i, j, k, l, n, shift + integer :: ism, iem, jsm, jem + integer :: isc, iec, jsc, jec, isd, ied, jsd, jed + + integer, allocatable, dimension(:) :: tile + integer, allocatable, dimension(:) :: pe_start, pe_end, tile1, tile2 + integer, allocatable, dimension(:) :: istart1, iend1, jstart1, jend1 + integer, allocatable, dimension(:) :: istart2, iend2, jstart2, jend2 + integer, allocatable, dimension(:,:) :: layout2D, global_indices + real(kind=r8_kind), allocatable, dimension(:,:,:,:) :: x, x1, y, y1, x_save, y_save + real(kind=r8_kind), allocatable, dimension(:,:,:,:) :: a, a1, b, b1 + real(kind=r8_kind), allocatable, dimension(:,:,: ) :: a1_2D, b1_2D + integer :: id_update + integer :: id1, id2 + logical :: folded_north + logical :: cubic_grid, single_tile + character(len=3) :: text + integer :: nx_save, ny_save + integer :: id_single, id_update_single + + folded_north = .false. + cubic_grid = .false. + single_tile = .false. 
+    nx_save = nx
+    ny_save = ny
+    !--- check the test_type
+    select case(test_type)
+    case ( 'Single-Tile' )   !--- single with cyclic along x- and y-direction
+      single_tile = .true.
+      ntiles = 1
+      num_contact = 2
+    case ( 'Folded-north' )
+      ntiles = 1
+      num_contact = 2
+      folded_north = .true.
+    case ( 'Cubic-Grid' )
+      if( nx_cubic == 0 ) then
+        call mpp_error(NOTE,'update_domains_performance_r8: for Cubic_grid mosaic, nx_cubic is zero, '//&
+                       'No test is done for Cubic-Grid mosaic. ' )
+        return
+      endif
+      if( nx_cubic .NE. ny_cubic ) then
+        call mpp_error(NOTE,'update_domains_performance_r8: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//&
+                       'No test is done for Cubic-Grid mosaic. ' )
+        return
+      endif
+
+      nx = nx_cubic
+      ny = ny_cubic
+      ntiles = 6
+      num_contact = 12
+      cubic_grid = .true.
+
+    case default
+      call mpp_error(FATAL, 'update_domains_performance_r8: no such test: '//test_type)
+    end select
+
+    allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) )
+    if( mod(npes, ntiles) == 0 ) then
+      npes_per_tile = npes/ntiles
+      text=""
+      write(text,"(I3)") npes_per_tile
+      call mpp_error(NOTE, 'update_domains_performance_r8: For Mosaic "'//trim(test_type)// &
+                     '", each tile will be distributed over '//text//' processors.')
+      ntile_per_pe = 1
+      allocate(tile(ntile_per_pe))
+      tile = pe/npes_per_tile+1
+      if(cubic_grid) then
+        call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout )
+      else
+        call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout )
+      endif
+      do n = 1, ntiles
+        pe_start(n) = (n-1)*npes_per_tile
+        pe_end(n)   = n*npes_per_tile-1
+      end do
+    else if ( mod(ntiles, npes) == 0 ) then
+      ntile_per_pe = ntiles/npes
+      text=""
+      write(text,"(I3)") ntile_per_pe
+      call mpp_error(NOTE, 'update_domains_performance_r8: For Mosaic "'//trim(test_type)// &
+                     '", there will be '//text//' tiles on each processor.')
+      allocate(tile(ntile_per_pe))
+      do n = 1, ntile_per_pe
+        tile(n) = pe*ntile_per_pe + n
+      end do
+      do n = 1, ntiles
+        pe_start(n) = (n-1)/ntile_per_pe
+        pe_end(n)   = pe_start(n)
+      end do
+      layout = 1
+    else
+      call mpp_error(NOTE,'update_domains_performance_r8: npes should be multiple of ntiles or ' // &
+                     'ntiles should be multiple of npes. No test is done for '//trim(test_type) )
+      return
+    end if
+
+    do n = 1, ntiles
+      global_indices(:,n) = (/1,nx,1,ny/)
+      layout2D(:,n)       = layout
+    end do
+
+    allocate(tile1(num_contact), tile2(num_contact) )
+    allocate(istart1(num_contact), iend1(num_contact), jstart1(num_contact), jend1(num_contact) )
+    allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) )
+
+    !--- define domain
+    if(single_tile) then
+      !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST)
+      tile1(1) = 1; tile2(1) = 1
+      istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1;  jend1(1) = ny
+      istart2(1) = 1;  iend2(1) = 1;  jstart2(1) = 1;  jend2(1) = ny
+      !--- Contact line 2, between tile 1 (SOUTH) and tile 1 (NORTH)  --- cyclic
+      tile1(2) = 1; tile2(2) = 1
+      istart1(2) = 1;  iend1(2) = nx; jstart1(2) = 1;  jend1(2) = 1
+      istart2(2) = 1;  iend2(2) = nx; jstart2(2) = ny; jend2(2) = ny
+      call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, &
+                             istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2,      &
+                             pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, &
+                             name = test_type, symmetry = .false. )
+    else if(folded_north) then
+      !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST)  --- cyclic
+      tile1(1) = 1; tile2(1) = 1
+      istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1;  jend1(1) = ny
+      istart2(1) = 1;  iend2(1) = 1;  jstart2(1) = 1;  jend2(1) = ny
+      !--- Contact line 2, between tile 1 (NORTH) and tile 1 (NORTH)  --- folded-north-edge
+      tile1(2) = 1; tile2(2) = 1
+      istart1(2) = 1;  iend1(2) = nx/2;   jstart1(2) = ny; jend1(2) = ny
+      istart2(2) = nx; iend2(2) = nx/2+1; jstart2(2) = ny; jend2(2) = ny
+      call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, &
+                             istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2,      &
+                             pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, &
+                             name = test_type, symmetry = .false. )
+    else if( cubic_grid ) then
+      call define_cubic_mosaic(test_type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), &
+                               global_indices, layout2D, pe_start, pe_end )
+    endif
+
+    !--- setup data
+    call mpp_get_compute_domain( domain, isc, iec, jsc, jec )
+    call mpp_get_data_domain   ( domain, isd, ied, jsd, jed )
+    call mpp_get_memory_domain ( domain, ism, iem, jsm, jem )
+    allocate( x      (ism:iem,jsm:jem,nz, ntile_per_pe) )
+    allocate( x_save (ism:iem,jsm:jem,nz, ntile_per_pe) )
+    allocate( a      (ism:iem,jsm:jem,nz, ntile_per_pe) )
+    x = 0
+    do l = 1, ntile_per_pe
+      do k = 1, nz
+        do j = jsc, jec
+          do i = isc, iec
+            x(i, j, k, l) = tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9
+          enddo
+        enddo
+      enddo
+    enddo
+
+    a = x
+    x_save = x
+
+    if(num_fields<1) then
+      call mpp_error(FATAL, "update_domains_performance_r8: num_fields must be a positive integer")
+    endif
+
+    id1 = mpp_clock_id( test_type, flags=MPP_CLOCK_SYNC)
+    id_single = mpp_clock_id( test_type//' non-blocking', flags=MPP_CLOCK_SYNC)
+
+    call mpp_clock_begin(id1)
+    do l=1,ntile_per_pe
+      call mpp_update_domains( x, domain, tile_count=l)
+    enddo
+    call mpp_clock_end  (id1)
+
+    call mpp_clock_begin(id_single)
+    do l=1,ntile_per_pe
+      id_update_single = mpp_start_update_domains(a, domain, tile_count=l)
+    enddo
+    call mpp_clock_end  (id_single)
+
+    !---- sleep some time for non-blocking.
+    if(do_sleep) call sleep(1)
+
+    id1 = mpp_clock_id( test_type//' group', flags=MPP_CLOCK_SYNC )
+    id2 = mpp_clock_id( test_type//' group non-blocking', flags=MPP_CLOCK_SYNC )
+
+    if(ntile_per_pe == 1) then
+      allocate( x1(ism:iem,jsm:jem,nz, num_fields) )
+      allocate( a1(ism:iem,jsm:jem,nz, num_fields) )
+      if(mix_2D_3D) allocate( a1_2D(ism:iem,jsm:jem,num_fields) )
+
+      do n = 1, num_iter
+        do l = 1, num_fields
+          x1(:,:,:,l) = x_save(:,:,:,1)
+          a1(:,:,:,l) = x_save(:,:,:,1)
+          if(mix_2D_3D) a1_2D(:,:,l) = x_save(:,:,1,1)
+        enddo
+
+        call mpp_clock_begin(id1)
+        do l = 1, num_fields
+          call mpp_update_domains( x1(:,:,:,l), domain, complete=l==num_fields, tile_count=1)
+        enddo
+        call mpp_clock_end  (id1)
+
+        ! non-blocking update
+        call mpp_clock_begin(id2)
+        if( n == 1 ) then
+          do l = 1, num_fields
+            if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, complete=.false., tile_count=1)
+            id_update = mpp_start_update_domains(a1(:,:,:,l), domain, complete=l==num_fields, tile_count=1)
+          enddo
+        else
+          do l = 1, num_fields
+            if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, &
+                              update_id=id_update, complete=.false., tile_count=1)
+            id_update = mpp_start_update_domains(a1(:,:,:,l), domain, update_id=id_update, &
+                              complete=l==num_fields, tile_count=1)
+          enddo
+        endif
+        call mpp_clock_end  (id2)
+
+        !---- sleep some time for non-blocking.
+        if(do_sleep) call sleep(1)
+
+        call mpp_clock_begin(id2)
+        do l = 1, num_fields
+          if(mix_2D_3D) call mpp_complete_update_domains(id_update, a1_2D(:,:,l), domain, &
+                              complete=.false., tile_count=1)
+          call mpp_complete_update_domains(id_update, a1(:,:,:,l), domain, complete=l==num_fields, tile_count=1)
+        enddo
+        call mpp_clock_end  (id2)
+
+
+        !--- compare checksum
+        do l = 1, num_fields
+          write(text, '(i3.3)') l
+          call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), test_type//' X'//text)
+        enddo
+        if(mix_2D_3D)call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), test_type//' X 2D')
+      enddo
+      deallocate(x1, a1)
+      if(mix_2D_3D) deallocate(a1_2D)
+    endif
+
+    call mpp_clock_begin(id_single)
+    do l=1,ntile_per_pe
+      call mpp_complete_update_domains(id_update_single, a, domain, tile_count=l)
+    enddo
+    call mpp_clock_end  (id_single)
+    call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type)
+    deallocate(x, a, x_save)
+    !------------------------------------------------------------------
+    !  vector update : BGRID_NE, one extra point in each direction for cubic-grid
+    !------------------------------------------------------------------
+    !--- setup data
+    shift = 0
+    if(single_tile .or. folded_north) then
+      shift = 0
+    else
+      shift = 1
+    endif
+
+    allocate( x      (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) )
+    allocate( y      (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) )
+    allocate( x_save (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) )
+    allocate( y_save (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) )
+    allocate( a      (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) )
+    allocate( b      (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) )
+    x = 0
+    y = 0
+    do l = 1, ntile_per_pe
+      do k = 1, nz
+        do j = jsc, jec+shift
+          do i = isc, iec+shift
+            x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9
+            y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9
+          end do
+        end do
+      end do
+    enddo
+    a = x; b = y
+    x_save = x; y_save = y
+
+    id1 = mpp_clock_id( trim(test_type)//' BGRID', flags=MPP_CLOCK_SYNC )
+    id_single = mpp_clock_id( trim(test_type)//' BGRID non-blocking', flags=MPP_CLOCK_SYNC )
+    !--- blocking update
+    call mpp_clock_begin(id1)
+    do l=1,ntile_per_pe
+      call mpp_update_domains( x, y, domain, gridtype=BGRID_NE, tile_count=l)
+    enddo
+    call mpp_clock_end  (id1)
+
+    !--- non-blocking update
+    call mpp_clock_begin(id_single)
+    do l=1,ntile_per_pe
+      id_update_single = mpp_start_update_domains(a, b, domain, gridtype=BGRID_NE, tile_count=l)
+    enddo
+    call mpp_clock_end  (id_single)
+
+    !---- sleep some time for non-blocking.
+    if (do_sleep) call sleep(1)
+
+    id1 = mpp_clock_id( trim(test_type)//' BGRID group', flags=MPP_CLOCK_SYNC)
+    id2 = mpp_clock_id( trim(test_type)//' BGRID group non-blocking', flags=MPP_CLOCK_SYNC)
+    if(ntile_per_pe == 1) then
+      allocate( x1(ism:iem+shift,jsm:jem+shift,nz,num_fields) )
+      allocate( y1(ism:iem+shift,jsm:jem+shift,nz,num_fields) )
+      allocate( a1(ism:iem+shift,jsm:jem+shift,nz,num_fields) )
+      allocate( b1(ism:iem+shift,jsm:jem+shift,nz,num_fields) )
+      if(mix_2D_3D) then
+        allocate( a1_2D(ism:iem+shift,jsm:jem+shift,num_fields) )
+        allocate( b1_2D(ism:iem+shift,jsm:jem+shift,num_fields) )
+      endif
+
+      do n = 1, num_iter
+        do l = 1, num_fields
+          x1(:,:,:,l) = x_save(:,:,:,1)
+          a1(:,:,:,l) = x_save(:,:,:,1)
+          y1(:,:,:,l) = y_save(:,:,:,1)
+          b1(:,:,:,l) = y_save(:,:,:,1)
+          if(mix_2D_3D) then
+            a1_2D(:,:,l) = x_save(:,:,1,1)
+            b1_2D(:,:,l) = y_save(:,:,1,1)
+          endif
+        enddo
+
+        call mpp_clock_begin(id1)
+        do l = 1, num_fields
+          call mpp_update_domains( x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=BGRID_NE, &
+                                   complete=l==num_fields, tile_count=1)
+        enddo
+        call mpp_clock_end  (id1)
+
+        !--- non-blocking update
+        call mpp_clock_begin(id2)
+        if( n == 1 ) then
+          do l = 1, num_fields
+            if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, &
+                              gridtype=BGRID_NE, complete=.false.)
+            id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, &
+                              gridtype=BGRID_NE, complete=l==num_fields, tile_count=1)
+          enddo
+        else
+          do l = 1, num_fields
+            if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, gridtype=BGRID_NE, &
+                              update_id=id_update, complete=.false.)
+            id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, gridtype=BGRID_NE, &
+                              update_id=id_update, complete=l==num_fields, tile_count=1)
+          enddo
+        endif
+        call mpp_clock_end  (id2)
+
+        !---- sleep some time for non-blocking.
+        if(do_sleep) call sleep(1)
+
+        call mpp_clock_begin(id2)
+        do l = 1, num_fields
+          if(mix_2D_3D)call mpp_complete_update_domains(id_update, a1_2D(:,:,l), b1_2D(:,:,l), domain, &
+                              gridtype=BGRID_NE, complete=.false.)
+          call mpp_complete_update_domains(id_update, a1(:,:,:,l), b1(:,:,:,l), domain, &
+                              gridtype=BGRID_NE, complete=l==num_fields, tile_count=1)
+        enddo
+        call mpp_clock_end  (id2)
+
+        !--- compare checksum
+        do l = 1, num_fields
+          write(text, '(i3.3)') l
+          call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), test_type//' BGRID X'//text)
+          call compare_checksums( y1(:,:,:,l), b1(:,:,:,l), test_type//' BGRID Y'//text)
+          if(mix_2D_3D) then
+            call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), test_type//' BGRID X'//text)
+            call compare_checksums( y1(:,:,:,1), b1(:,:,:,1), test_type//' BGRID Y'//text)
+          endif
+        enddo
+        if(mix_2D_3D) then
+          call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), test_type//' BGRID X 2D')
+          call compare_checksums( y1(:,:,1,:), b1_2D(:,:,:), test_type//' BGRID Y 2D')
+        endif
+      enddo
+      deallocate(x1, y1, a1, b1)
+      if(mix_2D_3D) deallocate(a1_2D, b1_2D)
+    endif
+
+    call mpp_clock_begin(id_single)
+    do l=1,ntile_per_pe
+      call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=BGRID_NE, tile_count=l)
+    enddo
+    call mpp_clock_end(id_single)
+
+    !--- compare checksums
+    call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type//' BGRID X')
+    call compare_checksums( y(:,:,:,1), b(:,:,:,1), test_type//' BGRID Y')
+
+    deallocate(x, y, a, b, x_save, y_save)
+    !------------------------------------------------------------------
+    !  vector update : CGRID_NE, one extra point in each direction for cubic-grid
+    !------------------------------------------------------------------
+    allocate( x      (ism:iem+shift,jsm:jem      ,nz,ntile_per_pe) )
+    allocate( y      (ism:iem      ,jsm:jem+shift,nz,ntile_per_pe) )
+    allocate( a      (ism:iem+shift,jsm:jem      ,nz,ntile_per_pe) )
+    allocate( b      (ism:iem      ,jsm:jem+shift,nz,ntile_per_pe) )
+    allocate( x_save (ism:iem+shift,jsm:jem      ,nz,ntile_per_pe) )
+    allocate( y_save (ism:iem      ,jsm:jem+shift,nz,ntile_per_pe) )
+
+    x = 0
+    y = 0
+    do l = 1, ntile_per_pe
+      do k = 1, nz
+        do j = jsc, jec
+          do i = isc, iec+shift
+            x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9
+          end do
+        end do
+        do j = jsc, jec+shift
+          do i = isc, iec
+            y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9
+          end do
+        end do
+      end do
+    enddo
+
+    a = x; b = y
+    x_save = x; y_save = y
+
+    id1 = mpp_clock_id( trim(test_type)//' CGRID', flags=MPP_CLOCK_SYNC )
+    id_single = mpp_clock_id( trim(test_type)//' CGRID non-blocking', flags=MPP_CLOCK_SYNC )
+
+    call mpp_clock_begin(id1)
+    do l=1,ntile_per_pe
+      call mpp_update_domains( x, y, domain, gridtype=CGRID_NE, tile_count=l)
+    enddo
+    call mpp_clock_end  (id1)
+
+    !--- non-blocking update
+    call mpp_clock_begin(id_single)
+    do l=1,ntile_per_pe
+      id_update_single = mpp_start_update_domains(a, b, domain, gridtype=CGRID_NE, tile_count=l)
+    enddo
+    call mpp_clock_end  (id_single)
+
+    !---- sleep some time for non-blocking.
+    if(do_sleep) call sleep(1)
+
+    id1 = mpp_clock_id( trim(test_type)//' CGRID group', flags=MPP_CLOCK_SYNC )
+    id2 = mpp_clock_id( trim(test_type)//' CGRID group non-blocking', flags=MPP_CLOCK_SYNC )
+
+    if(ntile_per_pe == 1) then
+      allocate( x1(ism:iem+shift,jsm:jem      ,nz,num_fields) )
+      allocate( y1(ism:iem      ,jsm:jem+shift,nz,num_fields) )
+      allocate( a1(ism:iem+shift,jsm:jem      ,nz,num_fields) )
+      allocate( b1(ism:iem      ,jsm:jem+shift,nz,num_fields) )
+      if(mix_2D_3D) then
+        allocate( a1_2D(ism:iem+shift,jsm:jem      ,num_fields) )
+        allocate( b1_2D(ism:iem      ,jsm:jem+shift,num_fields) )
+      endif
+
+      do n = 1, num_iter
+        do l = 1, num_fields
+          x1(:,:,:,l) = x_save(:,:,:,1)
+          a1(:,:,:,l) = x_save(:,:,:,1)
+          y1(:,:,:,l) = y_save(:,:,:,1)
+          b1(:,:,:,l) = y_save(:,:,:,1)
+          if(mix_2D_3D) then
+            a1_2D(:,:,l) = x_save(:,:,1,1)
+            b1_2D(:,:,l) = y_save(:,:,1,1)
+          endif
+        enddo
+
+        call mpp_clock_begin(id1)
+        do l = 1, num_fields
+          call mpp_update_domains( x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=CGRID_NE, &
+                                   complete=l==num_fields, tile_count=1 )
+        enddo
+        call mpp_clock_end  (id1)
+
+        !--- non-blocking update
+        call mpp_clock_begin(id2)
+        if( n == 1 ) then
+          do l = 1, num_fields
+            if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, &
+                              gridtype=CGRID_NE, complete=.false.)
+            id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, &
+                              gridtype=CGRID_NE, complete=l==num_fields, tile_count=1)
+          enddo
+        else
+          do l = 1, num_fields
+            if(mix_2D_3D)id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, gridtype=CGRID_NE, &
+                              update_id=id_update, complete=.false., tile_count=1)
+            id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, gridtype=CGRID_NE, &
+                              update_id=id_update, complete=l==num_fields, tile_count=1)
+          enddo
+        endif
+        call mpp_clock_end  (id2)
+
+        !---- sleep some time for non-blocking.
+        if(do_sleep) call sleep(1)
+
+        call mpp_clock_begin(id2)
+        do l = 1, num_fields
+          if(mix_2D_3D) call mpp_complete_update_domains(id_update, a1_2D(:,:,l), b1_2D(:,:,l), domain, &
+                              gridtype=CGRID_NE, complete=.false., tile_count=1)
+          call mpp_complete_update_domains(id_update, a1(:,:,:,l), b1(:,:,:,l), domain, &
+                              gridtype=CGRID_NE, complete=l==num_fields, tile_count=1)
+        enddo
+        call mpp_clock_end  (id2)
+
+        !--- compare checksum
+        do l = 1, num_fields
+          write(text, '(i3.3)') l
+          call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), test_type//' CGRID X'//text)
+          call compare_checksums( y1(:,:,:,l), b1(:,:,:,l), test_type//' CGRID Y'//text)
+        enddo
+        if(mix_2D_3D) then
+          call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), test_type//' CGRID X 2D')
+          call compare_checksums( y1(:,:,1,:), b1_2D(:,:,:), test_type//' CGRID Y 2D')
+        endif
+      enddo
+      deallocate(x1, y1, a1, b1)
+      if(mix_2D_3D) deallocate(a1_2D, b1_2D)
+    endif
+
+    call mpp_clock_begin(id_single)
+    do l=1,ntile_per_pe
+      call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=CGRID_NE, tile_count=l)
+    enddo
+    call mpp_clock_end  (id_single)
+
+    !--- compare checksums
+
+    call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type//' CGRID X')
+    call compare_checksums( y(:,:,:,1), b(:,:,:,1), test_type//' CGRID Y')
+
+    deallocate(x, y, a, b, x_save, y_save)
+
+    !------------------------------------------------------------------
+    !  vector update : AGRID vector and scalar pair
+    !------------------------------------------------------------------
+    allocate( x      (ism:iem,jsm:jem,nz,ntile_per_pe) )
+    allocate( y      (ism:iem,jsm:jem,nz,ntile_per_pe) )
+    allocate( a      (ism:iem,jsm:jem,nz,ntile_per_pe) )
+    allocate( b      (ism:iem,jsm:jem,nz,ntile_per_pe) )
+    allocate( x_save (ism:iem,jsm:jem,nz,ntile_per_pe) )
+    allocate( y_save (ism:iem,jsm:jem,nz,ntile_per_pe) )
+
+    x = 0
+    y = 0
+    do l = 1, ntile_per_pe
+      do k = 1, nz
+        do j = jsc, jec
+          do i = isc, iec+shift
+            x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9
+          end do
+        end do
+        do j = jsc, jec+shift
+          do i = isc, iec
+            y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9
+          end do
+        end do
+      end do
+    enddo
+
+    a = x; b = y
+    x_save = x; y_save = y
+    ! blocking update
+    do l=1,ntile_per_pe
+      call mpp_update_domains( x, y, domain, gridtype=AGRID, tile_count=l)
+    enddo
+
+    id_update_single = mpp_start_update_domains(a, b, domain, gridtype=AGRID)
+    do l=1,ntile_per_pe
+      call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=AGRID, tile_count=l)
+    enddo
+
+    !--- compare checksum
+    call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type//' AGRID X')
+    call compare_checksums( y(:,:,:,1), b(:,:,:,1), test_type//' AGRID Y')
+
+    x = x_save; y = y_save
+    a = x_save; b = y_save
+    ! blocking update
+    do l=1,ntile_per_pe
+      call mpp_update_domains( x, y, domain, gridtype=AGRID, flags = SCALAR_PAIR, tile_count=l)
+
+      id_update_single = mpp_start_update_domains(a, b, domain, gridtype=AGRID, &
+                              flags = SCALAR_PAIR, tile_count=l)
+      call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=AGRID, &
+                              flags = SCALAR_PAIR, tile_count=l)
+    enddo
+    !--- compare checksums
+    call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type//' AGRID SCALAR-PAIR X')
+    call compare_checksums( y(:,:,:,1), b(:,:,:,1), test_type//' AGRID SCALAR-PAIR Y')
+
+    deallocate(x, y, a, b, x_save, y_save)
+
+    nx = nx_save
+    ny = ny_save
+
+    deallocate(layout2D, global_indices, pe_start, pe_end, tile1, tile2)
+    deallocate(istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2 )
+
+  end subroutine update_domains_performance_r8
+
+  !> run performance tests on 32-bit real arrays
+  subroutine update_domains_performance_r4( test_type )
+    character(len=*), intent(in) :: test_type
+
+    type(domain2D) :: domain
+    integer :: num_contact, ntiles, npes_per_tile, ntile_per_pe
+    integer :: i, j, k, l, n, shift
+    integer :: ism, iem, jsm, jem
+    integer :: isc, iec, jsc, jec, isd, ied, jsd, jed
+
+    integer, allocatable, dimension(:) :: tile
+    integer, allocatable, dimension(:) :: pe_start, pe_end, tile1, tile2
+    integer, allocatable, dimension(:) :: istart1, iend1, jstart1, jend1
+    integer, allocatable, dimension(:) :: istart2, iend2, jstart2, jend2
+    integer, allocatable, dimension(:,:) :: layout2D, global_indices
+    real(kind=r4_kind), allocatable, dimension(:,:,:,:) :: x, x1, y, y1, x_save, y_save
+    real(kind=r4_kind), allocatable, dimension(:,:,:,:) :: a, a1, b, b1
+    real(kind=r4_kind), allocatable, dimension(:,:,: ) :: a1_2D, b1_2D
+    integer :: id_update
+    integer :: id1, id2
+    logical :: folded_north
+    logical :: cubic_grid, single_tile
+    character(len=3) :: text
+    integer :: nx_save, ny_save
+    integer :: id_single, id_update_single
+
+    folded_north = .false.
+ cubic_grid = .false. + single_tile = .false. + nx_save = nx + ny_save = ny + !--- check the test_type + select case(test_type) + case ( 'Single-Tile' ) !--- single with cyclic along x- and y-direction + single_tile = .true. + ntiles = 1 + num_contact = 2 + case ( 'Folded-north' ) + ntiles = 1 + num_contact = 2 + folded_north = .true. + case ( 'Cubic-Grid' ) + if( nx_cubic == 0 ) then + call mpp_error(NOTE,'update_domains_performance_r4: for Cubic_grid mosaic, nx_cubic is zero, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + if( nx_cubic .NE. ny_cubic ) then + call mpp_error(NOTE,'update_domains_performance_r4: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + + nx = nx_cubic + ny = ny_cubic + ntiles = 6 + num_contact = 12 + cubic_grid = .true. + + case default + call mpp_error(FATAL, 'update_domains_performancez_r4: no such test: '//test_type) + end select + + allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) + if( mod(npes, ntiles) == 0 ) then + npes_per_tile = npes/ntiles + text="" + write(text,"(I3)") npes_per_tile + call mpp_error(NOTE,'update_domains_performance_r4: For Mosaic "'//trim(test_type)// & + '", each tile will be distributed over '//text//' processors.') + ntile_per_pe = 1 + allocate(tile(ntile_per_pe)) + tile = pe/npes_per_tile+1 + if(cubic_grid) then + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + else + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + endif + do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + else if ( mod(ntiles, npes) == 0 ) then + ntile_per_pe = ntiles/npes + text="" + write(text,"(I3)") ntile_per_pe + call mpp_error(NOTE,'update_domains_performance_r4: For Mosaic "'//trim(test_type)// & + '", there will be '//text//' tiles on each processor.') + allocate(tile(ntile_per_pe)) + do n = 1, ntile_per_pe + tile(n) 
= pe*ntile_per_pe + n + end do + do n = 1, ntiles + pe_start(n) = (n-1)/ntile_per_pe + pe_end(n) = pe_start(n) + end do + layout = 1 + else + call mpp_error(NOTE,'update_domains_performance_r4: npes should be multiple of ntiles or ' // & + 'ntiles should be multiple of npes. No test is done for '//trim(test_type) ) + return + end if + + do n = 1, ntiles + global_indices(:,n) = (/1,nx,1,ny/) + layout2D(:,n) = layout + end do + + allocate(tile1(num_contact), tile2(num_contact) ) + allocate(istart1(num_contact), iend1(num_contact), jstart1(num_contact), jend1(num_contact) ) + allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) ) + + !--- define domain + if(single_tile) then + !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) + tile1(1) = 1; tile2(1) = 1 + istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny + !--- Contact line 2, between tile 1 (SOUTH) and tile 1 (NORTH) --- cyclic + tile1(2) = 1; tile2(2) = 1 + istart1(2) = 1; iend1(2) = nx; jstart1(2) = 1; jend1(2) = 1 + istart2(2) = 1; iend2(2) = nx; jstart2(2) = ny; jend2(2) = ny + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name = test_type, symmetry = .false. 
) + else if(folded_north) then + !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) --- cyclic + tile1(1) = 1; tile2(1) = 1 + istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny + !--- Contact line 2, between tile 1 (NORTH) and tile 1 (NORTH) --- folded-north-edge + tile1(2) = 1; tile2(2) = 1 + istart1(2) = 1; iend1(2) = nx/2; jstart1(2) = ny; jend1(2) = ny + istart2(2) = nx; iend2(2) = nx/2+1; jstart2(2) = ny; jend2(2) = ny + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name = test_type, symmetry = .false. ) + else if( cubic_grid ) then + call define_cubic_mosaic(test_type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & + global_indices, layout2D, pe_start, pe_end ) + endif + + !--- setup data + call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + call mpp_get_memory_domain ( domain, ism, iem, jsm, jem ) + allocate( x (ism:iem,jsm:jem,nz, ntile_per_pe) ) + allocate( x_save (ism:iem,jsm:jem,nz, ntile_per_pe) ) + allocate( a (ism:iem,jsm:jem,nz, ntile_per_pe) ) + x = 0 + do l = 1, ntile_per_pe + do k = 1, nz + do j = jsc, jec + do i = isc, iec + x(i, j, k, l) = tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + enddo + enddo + enddo + enddo + + a = x + x_save = x + + if(num_fields<1) then + call mpp_error(FATAL, "update_domains_performanc_r4: num_fields must be a positive integer") + endif + + id1 = mpp_clock_id( test_type, flags=MPP_CLOCK_SYNC) + id_single = mpp_clock_id( test_type//' non-blocking', flags=MPP_CLOCK_SYNC) + + call mpp_clock_begin(id1) + do l=1,ntile_per_pe + call mpp_update_domains( x, domain, tile_count=l) + enddo + call mpp_clock_end(id1) + + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + 
id_update_single = mpp_start_update_domains(a, domain, tile_count=l) + enddo + call mpp_clock_end (id_single) + + !---- sleep some time for non-blocking. + if(do_sleep) call sleep(1) + + id1 = mpp_clock_id( test_type//' group', flags=MPP_CLOCK_SYNC ) + id2 = mpp_clock_id( test_type//' group non-blocking', flags=MPP_CLOCK_SYNC ) + + if(ntile_per_pe == 1) then + allocate( x1(ism:iem,jsm:jem,nz, num_fields) ) + allocate( a1(ism:iem,jsm:jem,nz, num_fields) ) + if(mix_2D_3D) allocate( a1_2D(ism:iem,jsm:jem,num_fields) ) + + do n = 1, num_iter + do l = 1, num_fields + x1(:,:,:,l) = x_save(:,:,:,1) + a1(:,:,:,l) = x_save(:,:,:,1) + if(mix_2D_3D) a1_2D(:,:,l) = x_save(:,:,1,1) + enddo + + call mpp_clock_begin(id1) + do l = 1, num_fields + call mpp_update_domains( x1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end (id1) + + ! non-blocking update + call mpp_clock_begin(id2) + if( n == 1 ) then + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + else + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, & + update_id=id_update, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), domain, update_id=id_update, & + complete=l==num_fields, tile_count=1) + enddo + endif + call mpp_clock_end (id2) + + !---- sleep some time for non-blocking. 
+ if(do_sleep) call sleep(1) + + call mpp_clock_begin(id2) + do l = 1, num_fields + if(mix_2D_3D) call mpp_complete_update_domains(id_update, a1_2D(:,:,l), domain, complete=.false., tile_count=1) + call mpp_complete_update_domains(id_update, a1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end (id2) + + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), test_type//' X'//text) + enddo + if(mix_2D_3D)call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), test_type//' X 2D') + enddo + deallocate(x1, a1) + if(mix_2D_3D) deallocate(a1_2D) + endif + + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + call mpp_complete_update_domains(id_update_single, a, domain, tile_count=l) + enddo + call mpp_clock_end (id_single) + call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type) + deallocate(x, a, x_save) + !------------------------------------------------------------------ + ! vector update : BGRID_NE, one extra point in each direction for cubic-grid + !------------------------------------------------------------------ + !--- set up the data + shift = 0 + if(single_tile .or. 
folded_north) then + shift = 0 + else + shift = 1 + endif + + allocate( x (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) + allocate( y (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) + allocate( x_save (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) + allocate( y_save (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) + allocate( a (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) + allocate( b (ism:iem+shift,jsm:jem+shift,nz,ntile_per_pe) ) + x = 0 + y = 0 + do l = 1, ntile_per_pe + do k = 1, nz + do j = jsc, jec+shift + do i = isc, iec+shift + x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + end do + end do + end do + enddo + a = x; b = y + x_save = x; y_save = y + + id1 = mpp_clock_id( trim(test_type)//' BGRID', flags=MPP_CLOCK_SYNC ) + id_single = mpp_clock_id( trim(test_type)//' BGRID non-blocking', flags=MPP_CLOCK_SYNC ) + !--- blocking update + call mpp_clock_begin(id1) + do l=1,ntile_per_pe + call mpp_update_domains( x, y, domain, gridtype=BGRID_NE, tile_count=l) + enddo + call mpp_clock_end (id1) + + !--- non-blocking update + call mpp_clock_begin(id_single) + id_update_single = mpp_start_update_domains(a, b, domain, gridtype=BGRID_NE) + call mpp_clock_end (id_single) + + !---- sleep some time for non-blocking. 
+ if (do_sleep) call sleep(1) + + id1 = mpp_clock_id( trim(test_type)//' BGRID group', flags=MPP_CLOCK_SYNC) + id2 = mpp_clock_id( trim(test_type)//' BGRID group non-blocking', flags=MPP_CLOCK_SYNC) + if(ntile_per_pe == 1) then + allocate( x1(ism:iem+shift,jsm:jem+shift,nz,num_fields) ) + allocate( y1(ism:iem+shift,jsm:jem+shift,nz,num_fields) ) + allocate( a1(ism:iem+shift,jsm:jem+shift,nz,num_fields) ) + allocate( b1(ism:iem+shift,jsm:jem+shift,nz,num_fields) ) + if(mix_2D_3D) then + allocate( a1_2D(ism:iem+shift,jsm:jem+shift,num_fields) ) + allocate( b1_2D(ism:iem+shift,jsm:jem+shift,num_fields) ) + endif + + do n = 1, num_iter + do l = 1, num_fields + x1(:,:,:,l) = x_save(:,:,:,1) + a1(:,:,:,l) = x_save(:,:,:,1) + y1(:,:,:,l) = y_save(:,:,:,1) + b1(:,:,:,l) = y_save(:,:,:,1) + if(mix_2D_3D) then + a1_2D(:,:,l) = x_save(:,:,1,1) + b1_2D(:,:,l) = y_save(:,:,1,1) + endif + enddo + + call mpp_clock_begin(id1) + do l = 1, num_fields + call mpp_update_domains( x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=BGRID_NE, & + complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end (id1) + + !--- non-blocking update + call mpp_clock_begin(id2) + if( n == 1 ) then + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, & + gridtype=BGRID_NE, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, & + gridtype=BGRID_NE, complete=l==num_fields, tile_count=1) + enddo + else + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, gridtype=BGRID_NE, & + update_id=id_update, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, gridtype=BGRID_NE, & + update_id=id_update, complete=l==num_fields, tile_count=1) + enddo + endif + call mpp_clock_end(id2) + !---- sleep some time for non-blocking. 
+ if(do_sleep) call sleep(1) + + call mpp_clock_begin(id2) + do l = 1, num_fields + if(mix_2D_3D) call mpp_complete_update_domains(id_update, a1_2D(:,:,l), b1_2D(:,:,l), domain, & + gridtype=BGRID_NE, complete=.false., tile_count=1) + call mpp_complete_update_domains(id_update, a1(:,:,:,l), b1(:,:,:,l), domain, & + gridtype=BGRID_NE, complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end(id2) + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), test_type//' BGRID X'//text) + call compare_checksums( y1(:,:,:,l), b1(:,:,:,l), test_type//' BGRID Y'//text) + if(mix_2D_3D) then + call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), test_type//' BGRID X'//text) + call compare_checksums( y1(:,:,:,1), b1(:,:,:,1), test_type//' BGRID Y'//text) + endif + enddo + if(mix_2D_3D) then + call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), test_type//' BGRID X 2D') + call compare_checksums( y1(:,:,1,:), b1_2D(:,:,:), test_type//' BGRID Y 2D') + endif + enddo + deallocate(x1, y1, a1, b1) + if(mix_2D_3D) deallocate(a1_2D, b1_2D) + endif + + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=BGRID_NE, tile_count=l) + enddo + call mpp_clock_end(id_single) + + !--- compare checksums + call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type//' BGRID X') + call compare_checksums( y(:,:,:,1), b(:,:,:,1), test_type//' BGRID Y') + + deallocate(x, y, a, b, x_save, y_save) + !------------------------------------------------------------------ + ! 
vector update : CGRID_NE, one extra point in each direction for cubic-grid + !------------------------------------------------------------------ + allocate( x (ism:iem+shift,jsm:jem ,nz,ntile_per_pe) ) + allocate( y (ism:iem ,jsm:jem+shift,nz,ntile_per_pe) ) + allocate( a (ism:iem+shift,jsm:jem ,nz,ntile_per_pe) ) + allocate( b (ism:iem ,jsm:jem+shift,nz,ntile_per_pe) ) + allocate( x_save (ism:iem+shift,jsm:jem ,nz,ntile_per_pe) ) + allocate( y_save (ism:iem ,jsm:jem+shift,nz,ntile_per_pe) ) + + x = 0 + y = 0 + do l = 1, ntile_per_pe + do k = 1, nz + do j = jsc, jec + do i = isc, iec+shift + x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + end do + end do + do j = jsc, jec+shift + do i = isc, iec + y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + end do + end do + end do + enddo + + a = x; b = y + x_save = x; y_save = y + + id1 = mpp_clock_id( trim(test_type)//' CGRID', flags=MPP_CLOCK_SYNC ) + id_single = mpp_clock_id( trim(test_type)//' CGRID non-blocking', flags=MPP_CLOCK_SYNC ) + + call mpp_clock_begin(id1) + do l=1,ntile_per_pe + call mpp_update_domains( x, y, domain, gridtype=CGRID_NE, tile_count=l) + enddo + call mpp_clock_end (id1) + + !--- non-blocking update + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + id_update_single = mpp_start_update_domains(a, b, domain, gridtype=CGRID_NE, tile_count=l) + enddo + call mpp_clock_end (id_single) + + !---- sleep some time for non-blocking. 
+ if(do_sleep) call sleep(1) + + id1 = mpp_clock_id( trim(test_type)//' CGRID group', flags=MPP_CLOCK_SYNC ) + id2 = mpp_clock_id( trim(test_type)//' CGRID group non-blocking', flags=MPP_CLOCK_SYNC ) + + if(ntile_per_pe == 1) then + allocate( x1(ism:iem+shift,jsm:jem ,nz,num_fields) ) + allocate( y1(ism:iem ,jsm:jem+shift,nz,num_fields) ) + allocate( a1(ism:iem+shift,jsm:jem ,nz,num_fields) ) + allocate( b1(ism:iem ,jsm:jem+shift,nz,num_fields) ) + if(mix_2D_3D) then + allocate( a1_2D(ism:iem+shift,jsm:jem ,num_fields) ) + allocate( b1_2D(ism:iem ,jsm:jem+shift,num_fields) ) + endif + + do n = 1, num_iter + do l = 1, num_fields + x1(:,:,:,l) = x_save(:,:,:,1) + a1(:,:,:,l) = x_save(:,:,:,1) + y1(:,:,:,l) = y_save(:,:,:,1) + b1(:,:,:,l) = y_save(:,:,:,1) + if(mix_2D_3D) then + a1_2D(:,:,l) = x_save(:,:,1,1) + b1_2D(:,:,l) = y_save(:,:,1,1) + endif + enddo + + call mpp_clock_begin(id1) + do l = 1, num_fields + call mpp_update_domains(x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=CGRID_NE, & + complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end(id1) + !--- non-blocking update + call mpp_clock_begin(id2) + if( n == 1 ) then + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, & + gridtype=CGRID_NE, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, & + gridtype=CGRID_NE, complete=l==num_fields, tile_count=1) + enddo + else + do l = 1, num_fields + if(mix_2D_3D)id_update = mpp_start_update_domains(a1_2D(:,:,l), b1_2D(:,:,l), domain, gridtype=CGRID_NE, & + update_id=id_update, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), b1(:,:,:,l), domain, gridtype=CGRID_NE, & + update_id=id_update, complete=l==num_fields, tile_count=1) + enddo + endif + call mpp_clock_end (id2) + + !---- sleep some time for non-blocking. 
+ if(do_sleep) call sleep(1) + + call mpp_clock_begin(id2) + do l = 1, num_fields + if(mix_2D_3D)call mpp_complete_update_domains(id_update, a1_2D(:,:,l), b1_2D(:,:,l), domain, & + gridtype=CGRID_NE, complete=.false., tile_count=1) + call mpp_complete_update_domains(id_update, a1(:,:,:,l), b1(:,:,:,l), domain, & + gridtype=CGRID_NE, complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end (id2) + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums( x1(:,:,:,l), a1(:,:,:,l), test_type//' CGRID X'//text) + call compare_checksums( y1(:,:,:,l), b1(:,:,:,l), test_type//' CGRID Y'//text) + enddo + if(mix_2D_3D) then + call compare_checksums( x1(:,:,1,:), a1_2D(:,:,:), test_type//' BGRID X 2D') + call compare_checksums( y1(:,:,1,:), b1_2D(:,:,:), test_type//' BGRID Y 2D') + endif + enddo + deallocate(x1, y1, a1, b1) + if(mix_2D_3D) deallocate(a1_2D, b1_2D) + endif + + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=CGRID_NE, tile_count=l) + enddo + call mpp_clock_end (id_single) + + !--- compare checksum + call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type//' CGRID X') + call compare_checksums( y(:,:,:,1), b(:,:,:,1), test_type//' CGRID Y') + + deallocate(x, y, a, b, x_save, y_save) + + !------------------------------------------------------------------ + ! 
vector update : AGRID vector and scalar pair + !------------------------------------------------------------------ + allocate( x (ism:iem,jsm:jem,nz,ntile_per_pe) ) + allocate( y (ism:iem,jsm:jem,nz,ntile_per_pe) ) + allocate( a (ism:iem,jsm:jem,nz,ntile_per_pe) ) + allocate( b (ism:iem,jsm:jem,nz,ntile_per_pe) ) + allocate( x_save (ism:iem,jsm:jem,nz,ntile_per_pe) ) + allocate( y_save (ism:iem,jsm:jem,nz,ntile_per_pe) ) + + + x = 0 + y = 0 + do l = 1, ntile_per_pe + do k = 1, nz + do j = jsc, jec + do i = isc, iec+shift + x(i,j,k,l) = 1.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + end do + end do + do j = jsc, jec+shift + do i = isc, iec + y(i,j,k,l) = 2.0e3 + tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + end do + end do + end do + enddo + + a = x; b = y + x_save = x; y_save = y + ! blocking update + do l=1,ntile_per_pe + call mpp_update_domains( x, y, domain, gridtype=AGRID, tile_count=l) + + id_update_single = mpp_start_update_domains(a, b, domain, gridtype=AGRID, tile_count=l) + call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=AGRID, tile_count=l) + enddo + !--- compare checksum + call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type//' AGRID X') + call compare_checksums( y(:,:,:,1), b(:,:,:,1), test_type//' AGRID Y') + + x = x_save; y = y_save + a = x_save; b = y_save + ! 
blocking update + do l=1,ntile_per_pe + call mpp_update_domains( x, y, domain, gridtype=AGRID, flags = SCALAR_PAIR, tile_count=l) + + id_update_single = mpp_start_update_domains(a, b, domain, gridtype=AGRID, & + flags = SCALAR_PAIR, tile_count=l) + call mpp_complete_update_domains(id_update_single, a, b, domain, gridtype=AGRID, & + flags = SCALAR_PAIR, tile_count=l) + enddo + !--- compare checksums + call compare_checksums( x(:,:,:,1), a(:,:,:,1), test_type//' AGRID SCALAR-PAIR X') + call compare_checksums( y(:,:,:,1), b(:,:,:,1), test_type//' AGRID SCALAR-PAIR Y') + + deallocate(x, y, a, b, x_save, y_save) + + nx = nx_save + ny = ny_save + + deallocate(layout2D, global_indices, pe_start, pe_end, tile1, tile2) + deallocate(istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2 ) + + end subroutine update_domains_performance_r4 + + !> run performance tests on 64-bit integer arrays + subroutine update_domains_performance_i8( test_type ) + character(len=*), intent(in) :: test_type + + type(domain2D) :: domain + integer :: num_contact, ntiles, npes_per_tile, ntile_per_pe + integer :: l, n, shift + integer(kind=i8_kind) :: i, j, k ! 
used to define the data arrays + integer :: ism, iem, jsm, jem + integer :: isc, iec, jsc, jec, isd, ied, jsd, jed + + integer, allocatable, dimension(:) :: tile + integer, allocatable, dimension(:) :: pe_start, pe_end, tile1, tile2 + integer, allocatable, dimension(:) :: istart1, iend1, jstart1, jend1 + integer, allocatable, dimension(:) :: istart2, iend2, jstart2, jend2 + integer, allocatable, dimension(:,:) :: layout2D, global_indices + integer(kind=i8_kind), allocatable, dimension(:,:,:,:) :: x, x1, x_save + integer(kind=i8_kind), allocatable, dimension(:,:,:,:) :: a, a1 + integer(kind=i8_kind), allocatable, dimension(:,:,: ) :: a1_2D + integer :: id_update + integer :: id1, id2 + logical :: folded_north + logical :: cubic_grid, single_tile + character(len=3) :: text + integer :: nx_save, ny_save + integer :: id_single, id_update_single + + folded_north = .false. + cubic_grid = .false. + single_tile = .false. + nx_save = nx + ny_save = ny + !--- check the test_type + select case(test_type) + case ( 'Single-Tile' ) !--- single with cyclic along x- and y-direction + single_tile = .true. + ntiles = 1 + num_contact = 2 + case ( 'Folded-north' ) + ntiles = 1 + num_contact = 2 + folded_north = .true. + case ( 'Cubic-Grid' ) + if( nx_cubic == 0 ) then + call mpp_error(NOTE,'update_domains_performance_r8: for Cubic_grid mosaic, nx_cubic is zero, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + if( nx_cubic .NE. ny_cubic ) then + call mpp_error(NOTE,'update_domains_performance_r8: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + + nx = nx_cubic + ny = ny_cubic + ntiles = 6 + num_contact = 12 + cubic_grid = .true. 
+ + case default + call mpp_error(FATAL, 'update_domains_performancez_r8: no such test: '//test_type) + end select + + allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) + if( mod(npes, ntiles) == 0 ) then + npes_per_tile = npes/ntiles + text="" + write(text,"(I3)") npes_per_tile + call mpp_error(NOTE, 'update_domains_performance_r8: For Mosaic "'//trim(test_type)// & + '", each tile will be distributed over '//text//' processors.') + ntile_per_pe = 1 + allocate(tile(ntile_per_pe)) + tile = pe/npes_per_tile+1 + if(cubic_grid) then + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + else + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + endif + do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + else if ( mod(ntiles, npes) == 0 ) then + ntile_per_pe = ntiles/npes + text="" + write(text,"(I3)") ntile_per_pe + call mpp_error(NOTE, 'update_domains_performance_r8: For Mosaic "'//trim(test_type)// & + '", there will be '//text//' tiles on each processor.') + allocate(tile(ntile_per_pe)) + do n = 1, ntile_per_pe + tile(n) = pe*ntile_per_pe + n + end do + do n = 1, ntiles + pe_start(n) = (n-1)/ntile_per_pe + pe_end(n) = pe_start(n) + end do + layout = 1 + else + call mpp_error(NOTE,'update_domains_performance_r8: npes should be multiple of ntiles or ' // & + 'ntiles should be multiple of npes. 
No test is done for '//trim(test_type) ) + return + end if + + do n = 1, ntiles + global_indices(:,n) = (/1,nx,1,ny/) + layout2D(:,n) = layout + end do + + allocate(tile1(num_contact), tile2(num_contact) ) + allocate(istart1(num_contact), iend1(num_contact), jstart1(num_contact), jend1(num_contact) ) + allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) ) + + !--- define domain + if(single_tile) then + !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) + tile1(1) = 1; tile2(1) = 1 + istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny + !--- Contact line 2, between tile 1 (SOUTH) and tile 1 (NORTH) --- cyclic + tile1(2) = 1; tile2(2) = 1 + istart1(2) = 1; iend1(2) = nx; jstart1(2) = 1; jend1(2) = 1 + istart2(2) = 1; iend2(2) = nx; jstart2(2) = ny; jend2(2) = ny + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name = test_type, symmetry = .false. ) + else if(folded_north) then + !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) --- cyclic + tile1(1) = 1; tile2(1) = 1 + istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny + !--- Contact line 2, between tile 1 (NORTH) and tile 1 (NORTH) --- folded-north-edge + tile1(2) = 1; tile2(2) = 1 + istart1(2) = 1; iend1(2) = nx/2; jstart1(2) = ny; jend1(2) = ny + istart2(2) = nx; iend2(2) = nx/2+1; jstart2(2) = ny; jend2(2) = ny + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name = test_type, symmetry = .false. 
) + else if( cubic_grid ) then + call define_cubic_mosaic(test_type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & + global_indices, layout2D, pe_start, pe_end ) + endif + + !--- setup data + call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + call mpp_get_memory_domain ( domain, ism, iem, jsm, jem ) + allocate( x (ism:iem,jsm:jem,nz, ntile_per_pe) ) + allocate( x_save (ism:iem,jsm:jem,nz, ntile_per_pe) ) + allocate( a (ism:iem,jsm:jem,nz, ntile_per_pe) ) + x = 0 + do l = 1, ntile_per_pe + do k = 1, nz + do j = jsc, jec + do i = isc, iec + x(i, j, k, l) = tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + enddo + enddo + enddo + enddo + + a = x + x_save = x + + if(num_fields<1) then + call mpp_error(FATAL, "update_domains_performanc_r8: num_fields must be a positive integer") + endif + + id1 = mpp_clock_id( test_type, flags=MPP_CLOCK_SYNC) + id_single = mpp_clock_id( test_type//' non-blocking', flags=MPP_CLOCK_SYNC) + + call mpp_clock_begin(id1) + do l=1,ntile_per_pe + call mpp_update_domains( x, domain, tile_count=l) + enddo + call mpp_clock_end (id1) + + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + id_update_single = mpp_start_update_domains(a, domain, tile_count=l) + enddo + call mpp_clock_end (id_single) + + !---- sleep some time for non-blocking. 
+ if(do_sleep) call sleep(1) + + id1 = mpp_clock_id( test_type//' group', flags=MPP_CLOCK_SYNC ) + id2 = mpp_clock_id( test_type//' group non-blocking', flags=MPP_CLOCK_SYNC ) + + if(ntile_per_pe == 1) then + allocate( x1(ism:iem,jsm:jem,nz, num_fields) ) + allocate( a1(ism:iem,jsm:jem,nz, num_fields) ) + if(mix_2D_3D) allocate( a1_2D(ism:iem,jsm:jem,num_fields) ) + + do n = 1, num_iter + do l = 1, num_fields + x1(:,:,:,l) = x_save(:,:,:,1) + a1(:,:,:,l) = x_save(:,:,:,1) + if(mix_2D_3D) a1_2D(:,:,l) = x_save(:,:,1,1) + enddo + + call mpp_clock_begin(id1) + do l = 1, num_fields + call mpp_update_domains( x1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end (id1) + + ! non-blocking update + call mpp_clock_begin(id2) + if( n == 1 ) then + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + else + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, & + update_id=id_update, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), domain, update_id=id_update, & + complete=l==num_fields, tile_count=1) + enddo + endif + call mpp_clock_end (id2) + + !---- sleep some time for non-blocking. 
+ if(do_sleep) call sleep(1) + + call mpp_clock_begin(id2) + do l = 1, num_fields + if(mix_2D_3D) call mpp_complete_update_domains(id_update, a1_2D(:,:,l), domain, & + complete=.false., tile_count=1) + call mpp_complete_update_domains(id_update, a1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end (id2) + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums_int( x1(:,:,:,l), a1(:,:,:,l), test_type//' X'//text) + enddo + if(mix_2D_3D)call compare_checksums_int( x1(:,:,1,:), a1_2D(:,:,:), test_type//' X 2D') + enddo + deallocate(x1, a1) + if(mix_2D_3D) deallocate(a1_2D) + endif + + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + call mpp_complete_update_domains(id_update_single, a, domain, tile_count=l) + enddo + call mpp_clock_end (id_single) + call compare_checksums_int( x(:,:,:,1), a(:,:,:,1), test_type) + + deallocate(x, a, x_save) + deallocate(layout2D, global_indices, pe_start, pe_end, tile1, tile2) + deallocate(istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2 ) + + end subroutine update_domains_performance_i8 + + !> run performance tests on 32-bit integer arrays + subroutine update_domains_performance_i4( test_type ) + character(len=*), intent(in) :: test_type + + type(domain2D) :: domain + integer :: num_contact, ntiles, npes_per_tile, ntile_per_pe + integer :: l, n, shift + integer(kind=i4_kind) :: i, j, k ! 
used to define the data arrays + integer :: ism, iem, jsm, jem + integer :: isc, iec, jsc, jec, isd, ied, jsd, jed + + integer, allocatable, dimension(:) :: tile + integer, allocatable, dimension(:) :: pe_start, pe_end, tile1, tile2 + integer, allocatable, dimension(:) :: istart1, iend1, jstart1, jend1 + integer, allocatable, dimension(:) :: istart2, iend2, jstart2, jend2 + integer, allocatable, dimension(:,:) :: layout2D, global_indices + integer(kind=i4_kind), allocatable, dimension(:,:,:,:) :: x, x1, x_save + integer(kind=i4_kind), allocatable, dimension(:,:,:,:) :: a, a1 + integer(kind=i4_kind), allocatable, dimension(:,:,: ) :: a1_2D + integer :: id_update + integer :: id1, id2 + logical :: folded_north + logical :: cubic_grid, single_tile + character(len=3) :: text + integer :: nx_save, ny_save + integer :: id_single, id_update_single + + folded_north = .false. + cubic_grid = .false. + single_tile = .false. + nx_save = nx + ny_save = ny + !--- check the test_type + select case(test_type) + case ( 'Single-Tile' ) !--- single with cyclic along x- and y-direction + single_tile = .true. + ntiles = 1 + num_contact = 2 + case ( 'Folded-north' ) + ntiles = 1 + num_contact = 2 + folded_north = .true. + case ( 'Cubic-Grid' ) + if( nx_cubic == 0 ) then + call mpp_error(NOTE,'update_domains_performance_r8: for Cubic_grid mosaic, nx_cubic is zero, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + if( nx_cubic .NE. ny_cubic ) then + call mpp_error(NOTE,'update_domains_performance_r8: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + + nx = nx_cubic + ny = ny_cubic + ntiles = 6 + num_contact = 12 + cubic_grid = .true. 
+ + case default + call mpp_error(FATAL, 'update_domains_performancez_r8: no such test: '//test_type) + end select + + allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) + if( mod(npes, ntiles) == 0 ) then + npes_per_tile = npes/ntiles + text="" + write(text,"(I3)") npes_per_tile + call mpp_error(NOTE, 'update_domains_performance_r8: For Mosaic "'//trim(test_type)// & + '", each tile will be distributed over '//text//' processors.') + ntile_per_pe = 1 + allocate(tile(ntile_per_pe)) + tile = pe/npes_per_tile+1 + if(cubic_grid) then + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + else + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + endif + do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + else if ( mod(ntiles, npes) == 0 ) then + ntile_per_pe = ntiles/npes + text="" + write(text,"(I3)") ntile_per_pe + call mpp_error(NOTE, 'update_domains_performance_r8: For Mosaic "'//trim(test_type)// & + '", there will be '//text//' tiles on each processor.') + allocate(tile(ntile_per_pe)) + do n = 1, ntile_per_pe + tile(n) = pe*ntile_per_pe + n + end do + do n = 1, ntiles + pe_start(n) = (n-1)/ntile_per_pe + pe_end(n) = pe_start(n) + end do + layout = 1 + else + call mpp_error(NOTE,'update_domains_performance_r8: npes should be multiple of ntiles or ' // & + 'ntiles should be multiple of npes. 
No test is done for '//trim(test_type) ) + return + end if + + do n = 1, ntiles + global_indices(:,n) = (/1,nx,1,ny/) + layout2D(:,n) = layout + end do + + allocate(tile1(num_contact), tile2(num_contact) ) + allocate(istart1(num_contact), iend1(num_contact), jstart1(num_contact), jend1(num_contact) ) + allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) ) + + !--- define domain + if(single_tile) then + !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) + tile1(1) = 1; tile2(1) = 1 + istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny + !--- Contact line 2, between tile 1 (SOUTH) and tile 1 (NORTH) --- cyclic + tile1(2) = 1; tile2(2) = 1 + istart1(2) = 1; iend1(2) = nx; jstart1(2) = 1; jend1(2) = 1 + istart2(2) = 1; iend2(2) = nx; jstart2(2) = ny; jend2(2) = ny + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name = test_type, symmetry = .false. ) + else if(folded_north) then + !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) --- cyclic + tile1(1) = 1; tile2(1) = 1 + istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny + !--- Contact line 2, between tile 1 (NORTH) and tile 1 (NORTH) --- folded-north-edge + tile1(2) = 1; tile2(2) = 1 + istart1(2) = 1; iend1(2) = nx/2; jstart1(2) = ny; jend1(2) = ny + istart2(2) = nx; iend2(2) = nx/2+1; jstart2(2) = ny; jend2(2) = ny + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name = test_type, symmetry = .false. 
) + else if( cubic_grid ) then + call define_cubic_mosaic(test_type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & + global_indices, layout2D, pe_start, pe_end ) + endif + + !--- setup data + call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + call mpp_get_memory_domain ( domain, ism, iem, jsm, jem ) + allocate( x (ism:iem,jsm:jem,nz, ntile_per_pe) ) + allocate( x_save (ism:iem,jsm:jem,nz, ntile_per_pe) ) + allocate( a (ism:iem,jsm:jem,nz, ntile_per_pe) ) + x = 0 + do l = 1, ntile_per_pe + do k = 1, nz + do j = jsc, jec + do i = isc, iec + x(i, j, k, l) = tile(l) + i*1.0e-3 + j*1.0e-6 + k*1.0e-9 + enddo + enddo + enddo + enddo + + a = x + x_save = x + + if(num_fields<1) then + call mpp_error(FATAL, "update_domains_performanc_r8: num_fields must be a positive integer") + endif + + id1 = mpp_clock_id( test_type, flags=MPP_CLOCK_SYNC) + id_single = mpp_clock_id( test_type//' non-blocking', flags=MPP_CLOCK_SYNC) + + call mpp_clock_begin(id1) + do l=1,ntile_per_pe + call mpp_update_domains( x, domain, tile_count=l) + enddo + call mpp_clock_end (id1) + + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + id_update_single = mpp_start_update_domains(a, domain, tile_count=l) + enddo + call mpp_clock_end (id_single) + + !---- sleep some time for non-blocking. 
+ if(do_sleep) call sleep(1) + + id1 = mpp_clock_id( test_type//' group', flags=MPP_CLOCK_SYNC ) + id2 = mpp_clock_id( test_type//' group non-blocking', flags=MPP_CLOCK_SYNC ) + + if(ntile_per_pe == 1) then + allocate( x1(ism:iem,jsm:jem,nz, num_fields) ) + allocate( a1(ism:iem,jsm:jem,nz, num_fields) ) + if(mix_2D_3D) allocate( a1_2D(ism:iem,jsm:jem,num_fields) ) + + do n = 1, num_iter + do l = 1, num_fields + x1(:,:,:,l) = x_save(:,:,:,1) + a1(:,:,:,l) = x_save(:,:,:,1) + if(mix_2D_3D) a1_2D(:,:,l) = x_save(:,:,1,1) + enddo + + call mpp_clock_begin(id1) + do l = 1, num_fields + call mpp_update_domains( x1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end (id1) + + ! non-blocking update + call mpp_clock_begin(id2) + if( n == 1 ) then + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + else + do l = 1, num_fields + if(mix_2D_3D) id_update = mpp_start_update_domains(a1_2D(:,:,l), domain, & + update_id=id_update, complete=.false., tile_count=1) + id_update = mpp_start_update_domains(a1(:,:,:,l), domain, update_id=id_update, & + complete=l==num_fields, tile_count=1) + enddo + endif + call mpp_clock_end (id2) + + !---- sleep some time for non-blocking. 
+ if(do_sleep) call sleep(1) + + call mpp_clock_begin(id2) + do l = 1, num_fields + if(mix_2D_3D) call mpp_complete_update_domains(id_update, a1_2D(:,:,l), domain, & + complete=.false., tile_count=1) + call mpp_complete_update_domains(id_update, a1(:,:,:,l), domain, complete=l==num_fields, tile_count=1) + enddo + call mpp_clock_end (id2) + + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums_int( x1(:,:,:,l), a1(:,:,:,l), test_type//' X'//text) + enddo + if(mix_2D_3D)call compare_checksums_int( x1(:,:,1,:), a1_2D(:,:,:), test_type//' X 2D') + enddo + deallocate(x1, a1) + if(mix_2D_3D) deallocate(a1_2D) + endif + + call mpp_clock_begin(id_single) + do l=1,ntile_per_pe + call mpp_complete_update_domains(id_update_single, a, domain, tile_count=l) + enddo + call mpp_clock_end (id_single) + call compare_checksums_int( x(:,:,:,1), a(:,:,:,1), test_type) + + deallocate(x, a, x_save) + deallocate(layout2D, global_indices, pe_start, pe_end, tile1, tile2) + deallocate(istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2 ) + + end subroutine update_domains_performance_i4 + + !> define mosaic domain for cubic grid + subroutine define_cubic_mosaic(type, domain, ni, nj, global_indices, layout, pe_start, pe_end, use_memsize) + character(len=*), intent(in) :: type + type(domain2d), intent(inout) :: domain + integer, intent(in) :: global_indices(:,:), layout(:,:) + integer, intent(in) :: ni(:), nj(:) + integer, intent(in) :: pe_start(:), pe_end(:) + logical, optional, intent(in) :: use_memsize + integer, dimension(12) :: istart1, iend1, jstart1, jend1, tile1 + integer, dimension(12) :: istart2, iend2, jstart2, jend2, tile2 + integer :: ntiles, num_contact, msize(2) + logical :: use_memsize_local + + use_memsize_local = .true. + if(present(use_memsize)) use_memsize_local = use_memsize + + ntiles = 6 + num_contact = 12 + if(size(pe_start(:)) .NE. 6 .OR. size(pe_end(:)) .NE. 
6 ) call mpp_error(FATAL, & + "define_cubic_mosaic: size of pe_start and pe_end should be 6") + if(size(global_indices,1) .NE. 4) call mpp_error(FATAL, & + "define_cubic_mosaic: size of first dimension of global_indices should be 4") + if(size(global_indices,2) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of second dimension of global_indices should be 6") + if(size(layout,1) .NE. 2) call mpp_error(FATAL, & + "define_cubic_mosaic: size of first dimension of layout should be 2") + if(size(layout,2) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of second dimension of layout should be 6") + if(size(ni(:)) .NE. 6 .OR. size(nj(:)) .NE. 6) call mpp_error(FATAL, & + "define_cubic_mosaic: size of ni and nj should be 6") + + !--- Contact line 1, between tile 1 (EAST) and tile 2 (WEST) + tile1(1) = 1; tile2(1) = 2 + istart1(1) = ni(1); iend1(1) = ni(1); jstart1(1) = 1; jend1(1) = nj(1) + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = nj(2) + !--- Contact line 2, between tile 1 (NORTH) and tile 3 (WEST) + tile1(2) = 1; tile2(2) = 3 + istart1(2) = 1; iend1(2) = ni(1); jstart1(2) = nj(1); jend1(2) = nj(1) + istart2(2) = 1; iend2(2) = 1; jstart2(2) = nj(3); jend2(2) = 1 + !--- Contact line 3, between tile 1 (WEST) and tile 5 (NORTH) + tile1(3) = 1; tile2(3) = 5 + istart1(3) = 1; iend1(3) = 1; jstart1(3) = 1; jend1(3) = nj(1) + istart2(3) = ni(5); iend2(3) = 1; jstart2(3) = nj(5); jend2(3) = nj(5) + !--- Contact line 4, between tile 1 (SOUTH) and tile 6 (NORTH) + tile1(4) = 1; tile2(4) = 6 + istart1(4) = 1; iend1(4) = ni(1); jstart1(4) = 1; jend1(4) = 1 + istart2(4) = 1; iend2(4) = ni(6); jstart2(4) = nj(6); jend2(4) = nj(6) + !--- Contact line 5, between tile 2 (NORTH) and tile 3 (SOUTH) + tile1(5) = 2; tile2(5) = 3 + istart1(5) = 1; iend1(5) = ni(2); jstart1(5) = nj(2); jend1(5) = nj(2) + istart2(5) = 1; iend2(5) = ni(3); jstart2(5) = 1; jend2(5) = 1 + !--- Contact line 6, between tile 2 (EAST) and tile 4 (SOUTH) + tile1(6) = 2; tile2(6) 
= 4 + istart1(6) = ni(2); iend1(6) = ni(2); jstart1(6) = 1; jend1(6) = nj(2) + istart2(6) = ni(4); iend2(6) = 1; jstart2(6) = 1; jend2(6) = 1 + !--- Contact line 7, between tile 2 (SOUTH) and tile 6 (EAST) + tile1(7) = 2; tile2(7) = 6 + istart1(7) = 1; iend1(7) = ni(2); jstart1(7) = 1; jend1(7) = 1 + istart2(7) = ni(6); iend2(7) = ni(6); jstart2(7) = nj(6); jend2(7) = 1 + !--- Contact line 8, between tile 3 (EAST) and tile 4 (WEST) + tile1(8) = 3; tile2(8) = 4 + istart1(8) = ni(3); iend1(8) = ni(3); jstart1(8) = 1; jend1(8) = nj(3) + istart2(8) = 1; iend2(8) = 1; jstart2(8) = 1; jend2(8) = nj(4) + !--- Contact line 9, between tile 3 (NORTH) and tile 5 (WEST) + tile1(9) = 3; tile2(9) = 5 + istart1(9) = 1; iend1(9) = ni(3); jstart1(9) = nj(3); jend1(9) = nj(3) + istart2(9) = 1; iend2(9) = 1; jstart2(9) = nj(5); jend2(9) = 1 + !--- Contact line 10, between tile 4 (NORTH) and tile 5 (SOUTH) + tile1(10) = 4; tile2(10) = 5 + istart1(10) = 1; iend1(10) = ni(4); jstart1(10) = nj(4); jend1(10) = nj(4) + istart2(10) = 1; iend2(10) = ni(5); jstart2(10) = 1; jend2(10) = 1 + !--- Contact line 11, between tile 4 (EAST) and tile 6 (SOUTH) + tile1(11) = 4; tile2(11) = 6 + istart1(11) = ni(4); iend1(11) = ni(4); jstart1(11) = 1; jend1(11) = nj(4) + istart2(11) = ni(6); iend2(11) = 1; jstart2(11) = 1; jend2(11) = 1 + !--- Contact line 12, between tile 5 (EAST) and tile 6 (WEST) + tile1(12) = 5; tile2(12) = 6 + istart1(12) = ni(5); iend1(12) = ni(5); jstart1(12) = 1; jend1(12) = nj(5) + istart2(12) = 1; iend2(12) = 1; jstart2(12) = 1; jend2(12) = nj(6) + msize(1) = maxval(ni(:)/layout(1,:)) + whalo + ehalo + 1 ! make sure memory domain size is no smaller than + msize(2) = maxval(nj(:)/layout(2,:)) + shalo + nhalo + 1 ! 
data domain size + + if(use_memsize_local) then + call mpp_define_mosaic(global_indices, layout, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, symmetry = .true., whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name = trim(type), memory_size = msize ) + else + call mpp_define_mosaic(global_indices, layout, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, symmetry = .true., whalo=whalo, ehalo=ehalo, & + shalo=shalo, nhalo=nhalo, name = trim(type) ) + endif + + return + + end subroutine define_cubic_mosaic + +end program test_update_domains_performance diff --git a/test_fms/mpp/test_update_domains_performance.sh b/test_fms/mpp/test_update_domains_performance.sh new file mode 100755 index 0000000000..5388556959 --- /dev/null +++ b/test_fms/mpp/test_update_domains_performance.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +#*********************************************************************** +# GNU Lesser General Public License +# +# This file is part of the GFDL Flexible Modeling System (FMS). +# +# FMS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# FMS is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. 
+ +# Jessica Liptak + +# Set common test settings. +. ../test_common.sh +# Run the test for one processor +echo "Running test_update_domains_performance with 1 pe" +run_test test_update_domains_performance 1 +# If on a Linux system that uses the command `nproc`, run the test +if [ $(command -v nproc) ] + # Looks like a linux system + then + # Get the number of available CPUs on the system + nProc=$(nproc) + if [ ${nProc} -ge 6 ] + then + # Run the test with 2 pes + echo "Running test_update_domains_performance with 6 pes" + run_test test_update_domains_performance 6 + fi +fi \ No newline at end of file diff --git a/test_fms/mpp_io/Makefile.am b/test_fms/mpp_io/Makefile.am index 817a8bd07b..224507e335 100644 --- a/test_fms/mpp_io/Makefile.am +++ b/test_fms/mpp_io/Makefile.am @@ -20,25 +20,31 @@ # This is an automake file for the test_fms/mpp_io directory of the # FMS package. -# uramirez, Ed Hartnett +# uramirez, Ed Hartnett, Ryan Mulhall # Find the fms_mod.mod file. -AM_CPPFLAGS = -I${top_srcdir}/include -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_srcdir}/include -I${top_builddir}/mpp -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la # Build this test program. -check_PROGRAMS = test_mpp_io +check_PROGRAMS = test_mpp_io \ + test_io_R4_R8 \ + test_io_mosaic_R4_R8 # This is the source code for the test. test_mpp_io_SOURCES = test_mpp_io.F90 +test_io_R4_R8_SOURCES = test_io_R4_R8.F90 +test_io_mosaic_R4_R8_SOURCES = test_io_mosaic_R4_R8.F90 # Run the test program. -TESTS = test_mpp_io2.sh +TESTS = test_mpp_io2.sh \ + test_io_R4_R8.sh \ + test_io_mosaic_R4_R8.sh # These files will also be distributed. 
-EXTRA_DIST = test_mpp_io2.sh input_base.nml +EXTRA_DIST = test_mpp_io2.sh test_io_R4_R8.sh test_io_mosaic_R4_R8.sh input_base.nml # Clean up CLEANFILES = input.nml *.nc* *.out diff --git a/test_fms/mpp_io/input_base.nml b/test_fms/mpp_io/input_base.nml old mode 100644 new mode 100755 diff --git a/test_fms/mpp_io/test_io_R4_R8.F90 b/test_fms/mpp_io/test_io_R4_R8.F90 new file mode 100644 index 0000000000..82afdbc6b6 --- /dev/null +++ b/test_fms/mpp_io/test_io_R4_R8.F90 @@ -0,0 +1,533 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . 
+!*********************************************************************** +!> @file +!> @brief unit test for mpp_write and mpp_read +!> @email gfdl.climate.model.info@noaa.gov +!> @description Tests mpp_write and mpp_read for reads/writes +!> with mixed precision reals on non-mosaic files +program test_io_R4_R8 + + use platform_mod, only : r4_kind, r8_kind, i8_kind + use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe, mpp_error, mpp_sync_self + use mpp_mod, only : FATAL, NOTE, mpp_chksum, MPP_DEBUG, mpp_set_stack_size, MPP_CLOCK_SYNC + use mpp_mod, only : mpp_sync, mpp_exit, mpp_clock_begin, mpp_clock_end, mpp_clock_id + use mpp_domains_mod, only : mpp_define_domains, mpp_domains_set_stack_size, domain1D, mpp_get_global_domain + use mpp_domains_mod, only : domain2D, mpp_define_layout, mpp_get_domain_components, mpp_define_mosaic + use mpp_domains_mod, only : mpp_get_memory_domain, mpp_get_compute_domain, mpp_domains_exit + use mpp_domains_mod, only : CENTER, EAST, NORTH, CORNER, mpp_get_data_domain + use mpp_domains_mod, only : mpp_define_io_domain, mpp_deallocate_domain + use mpp_io_mod, only : mpp_io_init, mpp_write_meta, axistype, fieldtype, atttype + use mpp_io_mod, only : MPP_RDONLY, mpp_open, MPP_OVERWR, MPP_ASCII, MPP_SINGLE + use mpp_io_mod, only : MPP_NETCDF, MPP_MULTI, mpp_get_atts, mpp_write, mpp_close + use mpp_io_mod, only : mpp_get_info, mpp_get_axes, mpp_get_fields, mpp_get_times + use mpp_io_mod, only : mpp_read, mpp_io_exit, MPP_APPEND + +#ifdef INTERNAL_FILE_NML + USE mpp_mod, ONLY: input_nml_file +#endif + + implicit none + +#ifdef use_netCDF +#include +#endif + + !--- namelist definition + integer :: nx=360, ny=200, nz=50, nt=2 + integer :: halo=2, stackmax=1500000, stackmaxd=2000000 + logical :: debug=.FALSE. + character(len=64) :: file='test', iospec='-F cachea' + integer :: layout(2) = (/1,1/) + integer :: ntiles_x=1, ntiles_y=1 ! total number of tiles will be ntiles_x*ntiles_y, + ! 
the grid size for each tile will be (nx/ntiles_x, ny/ntiles_y) + ! set ntiles > 1 to test the efficiency of mpp_io. + integer :: io_layout(2) = (/1,1/) ! set io_layout to divide each tile into io_layout(1)*io_layout(2) + ! group and write out data from the root pe of each group. + integer :: pack_size = 1 + + namelist / test_mpp_io_R8_nml / nx, ny, nz, nt, halo, stackmax, stackmaxd, debug, file, iospec, & + ntiles_x, ntiles_y, layout, io_layout + + integer :: pe, npes, io_status + type(domain2D) :: domain + + integer :: tks_per_sec + integer :: i,j,k, unit=7 + integer :: id_single_tile_mult_file + integer :: id_mult_tile, id_single_tile_with_group, id_mult_tile_with_group + logical :: opened + character(len=64) :: varname + + real(r4_kind) :: time4 + real(r8_kind) :: time8 + type(axistype) :: x, y, z, t + type(fieldtype) :: f + type(domain1D) :: xdom, ydom + integer(i8_kind) :: rchk, chk + real(r8_kind) :: doubledata = 0.0 + real(r8_kind) :: realarray(4) + +! initialize modules and set up namelist + call mpp_init() + pe = mpp_pe() + npes = mpp_npes() + +#ifdef INTERNAL_FILE_NML + read (input_nml_file, test_mpp_io_R8_nml, iostat=io_status) +#else + do + inquire( unit=unit, opened=opened ) + if( .NOT.opened )exit + unit = unit + 1 + if( unit.EQ.100 )call mpp_error( FATAL, 'Unable to locate unit number.' ) + end do + open( unit=unit, file='input.nml', iostat=io_status) + read( unit,test_mpp_io_R8_nml, iostat=io_status ) + close(unit) +#endif + + if (io_status > 0) then + call mpp_error(FATAL,'=>test_mpp_io_R8: Error reading input.nml') + endif + + call SYSTEM_CLOCK( count_rate=tks_per_sec ) + if( debug )then + call mpp_io_init(MPP_DEBUG) + else + call mpp_io_init() + end if + call mpp_set_stack_size(stackmax) + call mpp_domains_set_stack_size(stackmaxd) + + write( file,'(a,i3.3)' )trim(file), npes + +! determine the pack_size + pack_size = size(transfer(doubledata, realarray)) + if( pack_size .NE. 1 .AND. pack_size .NE. 
2) call mpp_error(FATAL,'test_mpp_io_R8: pack_size should be 1 or 2') + +! test read/writes for different symmetries and positions and + + if(ntiles_x == 1 .and. ntiles_y == 1 .and. io_layout(1) == 1 .and. io_layout(2) == 1) then + call test_netcdf_io_R4('Simple') + call test_netcdf_io_R4('Symmetry_T_cell') + call test_netcdf_io_R4('Symmetry_E_cell') + call test_netcdf_io_R4('Symmetry_N_cell') + call test_netcdf_io_R4('Symmetry_C_cell') + call test_netcdf_io_R4('Symmetry_T_cell_memory') + call test_netcdf_io_R4('Symmetry_E_cell_memory') + call test_netcdf_io_R4('Symmetry_N_cell_memory') + call test_netcdf_io_R4('Symmetry_C_cell_memory') + call test_netcdf_io_R8('Simple') + call test_netcdf_io_R8('Symmetry_T_cell') + call test_netcdf_io_R8('Symmetry_E_cell') + call test_netcdf_io_R8('Symmetry_N_cell') + call test_netcdf_io_R8('Symmetry_C_cell') + call test_netcdf_io_R8('Symmetry_T_cell_memory') + call test_netcdf_io_R8('Symmetry_E_cell_memory') + call test_netcdf_io_R8('Symmetry_N_cell_memory') + call test_netcdf_io_R8('Symmetry_C_cell_memory') + else + call mpp_error( FATAL, 'test_mpp_io_R8: Invalid nml parameters for non-mosaic files') + endif + + call mpp_io_exit() + call mpp_domains_exit() + call mpp_exit() + + contains + + !------------------------------------------------------------------ + !> Tests mpp_ reads and writes on netcdf files for both 32 and 64-bit reals + subroutine test_netcdf_io_R4(type) + character(len=*), intent(in) :: type + integer :: ndim, nvar, natt, ntime + integer :: is, ie, js, je, isd, ied, jsd, jed, ism, iem, jsm, jem + integer :: position, msize(2), ioff, joff, nxg, nyg + logical :: symmetry + type(atttype), allocatable :: atts(:) + type(fieldtype), allocatable :: vars(:) + type(axistype), allocatable :: axes(:) + real(r8_kind), allocatable :: tstamp(:) + real(r4_kind), dimension(:,:,:), allocatable :: data4, gdata4, rdata4 + !--- determine the shift and symmetry according to type, + select case(type) + case('Simple') + position = 
CENTER; symmetry = .false. + case('Symmetry_T_cell', 'Symmetry_T_cell_memory') + position = CENTER; symmetry = .true. + case('Symmetry_E_cell', 'Symmetry_E_cell_memory') + position = EAST; symmetry = .true. + case('Symmetry_N_cell', 'Symmetry_N_cell_memory') + position = NORTH; symmetry = .true. + case('Symmetry_C_cell', 'Symmetry_C_cell_memory') + position = CORNER; symmetry = .true. + case default + call mpp_error(FATAL, "type = "//type//" is not a valid test type") + end select + +!define domain decomposition + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + if(index(type,"memory") == 0) then + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, xhalo=halo, yhalo=halo, symmetry = symmetry ) + else ! on memory domain + msize(1) = nx/layout(1) + 2*halo + 2 + msize(2) = ny/layout(2) + 2*halo + 2 + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, xhalo=halo, yhalo=halo, symmetry = symmetry, & + memory_size = msize ) + end if + + call mpp_get_compute_domain( domain, is, ie, js, je, position=position ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed, position=position ) + call mpp_get_memory_domain ( domain, ism, iem, jsm, jem, position=position ) + call mpp_get_global_domain ( domain, xsize=nxg, ysize=nyg, position=position ) + call mpp_get_domain_components( domain, xdom, ydom ) + +!define global data arrays + allocate( gdata4(nxg,nyg,nz) ) + gdata4 = 0. + do k = 1,nz + do j = 1,nyg + do i = 1,nxg + gdata4(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + ioff = ism - isd; joff = jsm - jsd + allocate( data4(ism:iem,jsm:jem,nz) ) + data4 = 0 + data4(is+ioff:ie+ioff,js+joff:je+joff,:) = gdata4(is:ie,js:je,:) + +! test single thread ascii writes + if( nx*ny*nz*nt.LT.1000 .AND. index(type,"memory") .NE. 0) then + if( index(type,"memory") .NE. 
0 )then + call mpp_open( unit, trim(file)//'sR4.txt', action=MPP_OVERWR, form=MPP_ASCII, threading=MPP_SINGLE ) + call mpp_write_meta( unit, x, 'X', 'km', 'X distance', domain=xdom, data=(/(i-1.,i=1,nxg)/) ) + call mpp_write_meta( unit, y, 'Y', 'km', 'Y distance', domain=ydom, data=(/(i-1.,i=1,nyg)/) ) + call mpp_write_meta( unit, z, 'Z', 'km', 'Z distance', data=(/(i-1.,i=1,nz)/) ) + call mpp_write_meta( unit, t, 'T', 'sec', 'Time' ) + call mpp_write_meta( unit, f, (/x,y,z,t/), 'Data', 'metres', 'Random data' ) + call mpp_write( unit, x ) + call mpp_write( unit, y ) + call mpp_write( unit, z ) + do i = 0,nt-1 + time4 = i*10. + call mpp_write( unit, f, domain, data4, time4) + end do + call mpp_close(unit) + end if + end if +!netCDF distributed write + call mpp_open( unit, trim(type)//"_"//trim(file)//'dR4', action=MPP_OVERWR, & + form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI ) + call mpp_write_meta( unit, x, 'X', 'km', 'X distance', 'X', domain=xdom, data=(/(i-1.,i=1,nxg)/) ) + call mpp_write_meta( unit, y, 'Y', 'km', 'Y distance', 'Y', domain=ydom, data=(/(i-1.,i=1,nyg)/) ) + call mpp_write_meta( unit, z, 'Z', 'km', 'Z distance', 'Z', data=(/(i-1.,i=1,nz)/) ) + call mpp_write_meta( unit, t, 'T', 'sec', 'Time', 'T' ) + call mpp_write_meta( unit, f, (/x,y,z,t/), 'Data', 'metres', 'Random data', pack=pack_size ) + call mpp_write( unit, x ) + call mpp_write( unit, y ) + call mpp_write( unit, z ) + do i = 0,nt-1 + time4 = i*10. 
+ call mpp_write( unit, f, domain, data4, time4 ) + end do + call mpp_close(unit) + +!netCDF single-threaded write + call mpp_open( unit, trim(type)//"_"//trim(file)//'sR4', action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_SINGLE ) + + call mpp_write_meta( unit, x, 'X', 'km', 'X distance', 'X', domain=xdom, data=(/(i-1.,i=1,nxg)/) ) + + call mpp_write_meta( unit, y, 'Y', 'km', 'Y distance', 'Y', domain=ydom, data=(/(i-1.,i=1,nyg)/) ) + call mpp_write_meta( unit, z, 'Z', 'km', 'Z distance', 'Z', data=(/(i-1.,i=1,nz)/) ) + call mpp_write_meta( unit, t, 'T', 'sec', 'Time', 'T' ) + call mpp_write_meta( unit, f, (/x,y,z,t/), 'Data', 'metres', 'Random data', pack=pack_size ) + + call mpp_write( unit, x ) + call mpp_write( unit, y ) + call mpp_write( unit, z ) + + do i = 0,nt-1 + time4 = i*10. + call mpp_write( unit, f, domain, data4, time4) + end do + call mpp_close(unit) +!reopen and test appending write + call mpp_open( unit, trim(type)//"_"//trim(file)//'sR4', action=MPP_APPEND, form=MPP_NETCDF, threading=MPP_SINGLE ) + call mpp_write( unit, f, domain, data4, time4) + call mpp_close( unit ) + +! clear out for reads + allocate( rdata4(is:ie,js:je,nz) ) + +!netCDF multi-threaded read + call mpp_sync() + call mpp_open( unit, trim(type)//"_"//trim(file)//'sR4', action=MPP_RDONLY, & + form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_SINGLE ) + call mpp_get_info( unit, ndim, nvar, natt, ntime ) + allocate( atts(natt) ) + allocate( axes(ndim) ) + allocate( vars(nvar) ) + allocate( tstamp(ntime) ) + call mpp_get_atts ( unit, atts(:) ) + call mpp_get_axes ( unit, axes(:) ) + call mpp_get_fields ( unit, vars(:) ) + call mpp_get_times( unit, tstamp(:) ) + + call mpp_get_atts(vars(1),name=varname) + + if( varname.NE.'Data' )call mpp_error( FATAL, 'File being read is not the expected one.' ) + call mpp_read( unit, vars(1), domain, rdata4, 1 ) + ! 
compare read and stored chksums + rchk = mpp_chksum(rdata4(is:ie,js:je,:)) + chk = mpp_chksum( data4(is+ioff:ie+ioff,js+joff:je+joff,:)) + if( rchk == chk ) then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(type)//' R4: single-fileset: data comparison are OK.' ) + else + call mpp_error( FATAL, 'R4 Checksum error on multi-threaded/single-fileset netCDF read for type ' & + //trim(type) ) + end if + call mpp_close(unit) + deallocate( atts, axes, vars, tstamp ) + +!netCDF distributed read + call mpp_sync() !wait for previous write to complete + call mpp_open( unit, trim(type)//"_"//trim(file)//'dR4', action=MPP_RDONLY, & + form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI ) + call mpp_get_info( unit, ndim, nvar, natt, ntime ) + allocate( atts(natt) ) + allocate( axes(ndim) ) + allocate( vars(nvar) ) + allocate( tstamp(ntime) ) + call mpp_get_atts ( unit, atts(:) ) + call mpp_get_axes ( unit, axes(:) ) + call mpp_get_fields ( unit, vars(:) ) + call mpp_get_times( unit, tstamp(:) ) + + call mpp_get_atts(vars(1),name=varname) + rdata4 = 0 + + if( varname.NE.'Data' )call mpp_error( FATAL, 'File being read is not the expected one.' ) + + call mpp_read( unit, vars(1), domain, rdata4, 1 ) + ! compare read and stored chksums + rchk = mpp_chksum(rdata4(is:ie,js:je,:)) + chk = mpp_chksum( data4(is+ioff:ie+ioff,js+joff:je+joff,:)) + if( rchk == chk ) then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(type)//' R4: multi-fileset: data comparison are OK.' 
) + else + call mpp_error( FATAL, 'R4 Checksum error on multi-threaded/multi-fileset netCDF read for type ' & + //trim(type) ) + end if + deallocate( atts, axes, vars, tstamp ) + deallocate( rdata4, gdata4, data4) + + end subroutine test_netcdf_io_R4 + + subroutine test_netcdf_io_R8(type) + character(len=*), intent(in) :: type + integer :: ndim, nvar, natt, ntime + integer :: is, ie, js, je, isd, ied, jsd, jed, ism, iem, jsm, jem + integer :: position, msize(2), ioff, joff, nxg, nyg + logical :: symmetry + type(atttype), allocatable :: atts(:) + type(fieldtype), allocatable :: vars(:) + type(axistype), allocatable :: axes(:) + real(r8_kind), allocatable :: tstamp8(:) + real(r8_kind), dimension(:,:,:), allocatable :: data8, gdata8, rdata8 + !--- determine the shift and symmetry according to type, + select case(type) + case('Simple') + position = CENTER; symmetry = .false. + case('Symmetry_T_cell', 'Symmetry_T_cell_memory') + position = CENTER; symmetry = .true. + case('Symmetry_E_cell', 'Symmetry_E_cell_memory') + position = EAST; symmetry = .true. + case('Symmetry_N_cell', 'Symmetry_N_cell_memory') + position = NORTH; symmetry = .true. + case('Symmetry_C_cell', 'Symmetry_C_cell_memory') + position = CORNER; symmetry = .true. + case default + call mpp_error(FATAL, "type = "//type//" is not a valid test type") + end select + +!define domain decomposition + call mpp_define_layout( (/1,nx,1,ny/), npes, layout ) + if(index(type,"memory") == 0) then + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, xhalo=halo, yhalo=halo, symmetry = symmetry ) + else ! 
on memory domain + msize(1) = nx/layout(1) + 2*halo + 2 + msize(2) = ny/layout(2) + 2*halo + 2 + call mpp_define_domains( (/1,nx,1,ny/), layout, domain, xhalo=halo, yhalo=halo, symmetry = symmetry, & + memory_size = msize ) + end if + + call mpp_get_compute_domain( domain, is, ie, js, je, position=position ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed, position=position ) + call mpp_get_memory_domain ( domain, ism, iem, jsm, jem, position=position ) + call mpp_get_global_domain ( domain, xsize=nxg, ysize=nyg, position=position ) + call mpp_get_domain_components( domain, xdom, ydom ) + +!define global data arrays for 64 bit reals + allocate( gdata8(nxg,nyg,nz) ) + gdata8 = 0. + do k = 1,nz + do j = 1,nyg + do i = 1,nxg + gdata8(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + ioff = ism - isd; joff = jsm - jsd + allocate( data8(ism:iem,jsm:jem,nz) ) + data8 = 0 + data8(is+ioff:ie+ioff,js+joff:je+joff,:) = gdata8(is:ie,js:je,:) + + +!sequential write: single-threaded formatted: only if small +! test ascii writes + if( nx*ny*nz*nt.LT.1000 .AND. index(type,"memory") .NE. 0 )then +!here the only test is a successful write: please look at test.txt for verification. + call mpp_open( unit, trim(file)//'sR8.txt', action=MPP_OVERWR, form=MPP_ASCII, threading=MPP_SINGLE ) + call mpp_write_meta( unit, x, 'X', 'km', 'X distance', domain=xdom, data=(/(i-1.,i=1,nxg)/) ) + call mpp_write_meta( unit, y, 'Y', 'km', 'Y distance', domain=ydom, data=(/(i-1.,i=1,nyg)/) ) + call mpp_write_meta( unit, z, 'Z', 'km', 'Z distance', data=(/(i-1.,i=1,nz)/) ) + call mpp_write_meta( unit, t, 'T', 'sec', 'Time' ) + call mpp_write_meta( unit, f, (/x,y,z,t/), 'Data', 'metres', 'Random data' ) + call mpp_write( unit, x ) + call mpp_write( unit, y ) + call mpp_write( unit, z ) + do i = 0,nt-1 + time8 = i*10. 
+ call mpp_write( unit, f, domain, data8, time8) + end do + call mpp_close(unit) + end if + +!netCDF distributed write + call mpp_open( unit, trim(type)//"_"//trim(file)//'dR8', action=MPP_OVERWR, & + form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI ) + call mpp_write_meta( unit, x, 'X', 'km', 'X distance', 'X', domain=xdom, data=(/(i-1.,i=1,nxg)/) ) + call mpp_write_meta( unit, y, 'Y', 'km', 'Y distance', 'Y', domain=ydom, data=(/(i-1.,i=1,nyg)/) ) + call mpp_write_meta( unit, z, 'Z', 'km', 'Z distance', 'Z', data=(/(i-1.,i=1,nz)/) ) + call mpp_write_meta( unit, t, 'T', 'sec', 'Time', 'T' ) + call mpp_write_meta( unit, f, (/x,y,z,t/), 'Data', 'metres', 'Random data', pack=pack_size ) + call mpp_write( unit, x ) + call mpp_write( unit, y ) + call mpp_write( unit, z ) + do i = 0,nt-1 + time8 = i*10. + call mpp_write( unit, f, domain, data8, time8 ) + end do + call mpp_close(unit) + +!netCDF single-threaded write + call mpp_open( unit, trim(type)//"_"//trim(file)//'sR8', action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_SINGLE ) + + call mpp_write_meta( unit, x, 'X', 'km', 'X distance', 'X', domain=xdom, data=(/(i-1.,i=1,nxg)/) ) + + call mpp_write_meta( unit, y, 'Y', 'km', 'Y distance', 'Y', domain=ydom, data=(/(i-1.,i=1,nyg)/) ) + call mpp_write_meta( unit, z, 'Z', 'km', 'Z distance', 'Z', data=(/(i-1.,i=1,nz)/) ) + call mpp_write_meta( unit, t, 'T', 'sec', 'Time', 'T' ) + call mpp_write_meta( unit, f, (/x,y,z,t/), 'Data', 'metres', 'Random data', pack=pack_size ) + + call mpp_write( unit, x ) + call mpp_write( unit, y ) + call mpp_write( unit, z ) + + do i = 0,nt-1 + time8 = i*10. 
+ call mpp_write( unit, f, domain, data8, time8) + end do + call mpp_close(unit) + allocate( rdata8(is:ie,js:je,nz) ) + +!reopen and test appending write + call mpp_open( unit, trim(type)//"_"//trim(file)//'sR8', action=MPP_APPEND, form=MPP_NETCDF, threading=MPP_SINGLE ) + call mpp_write( unit, f, domain, data8, time8) + call mpp_close( unit ) + +!netCDF multi-threaded read + call mpp_sync() + call mpp_open( unit, trim(type)//"_"//trim(file)//'sR8', action=MPP_RDONLY, & + form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_SINGLE ) + call mpp_get_info( unit, ndim, nvar, natt, ntime ) + allocate( atts(natt) ) + allocate( axes(ndim) ) + allocate( vars(nvar) ) + allocate( tstamp8(ntime) ) + call mpp_get_atts ( unit, atts(:) ) + call mpp_get_axes ( unit, axes(:) ) + call mpp_get_fields ( unit, vars(:) ) + call mpp_get_times( unit, tstamp8(:) ) + + call mpp_get_atts(vars(1),name=varname) + + if( varname.NE.'Data' )call mpp_error( FATAL, 'File being read is not the expected one.' ) + call mpp_read( unit, vars(1), domain, rdata8, 1 ) + rchk = mpp_chksum(rdata8(is:ie,js:je,:)) + chk = mpp_chksum( data8(is+ioff:ie+ioff,js+joff:je+joff,:)) + if( rchk == chk ) then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(type)//' R8: single-fileset: data comparison are OK.' 
) + else + call mpp_error( FATAL, 'R8 Checksum error on multi-threaded/single-fileset netCDF read for type ' & + //trim(type) ) + end if + + call mpp_close(unit) + deallocate( atts, axes, vars, tstamp8 ) + +!netCDF distributed read + call mpp_sync() !wait for previous write to complete + call mpp_open( unit, trim(type)//"_"//trim(file)//'dR8', action=MPP_RDONLY, & + form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI ) + call mpp_get_info( unit, ndim, nvar, natt, ntime ) + allocate( atts(natt) ) + allocate( axes(ndim) ) + allocate( vars(nvar) ) + allocate( tstamp8(ntime) ) + call mpp_get_atts ( unit, atts(:) ) + call mpp_get_axes ( unit, axes(:) ) + call mpp_get_fields ( unit, vars(:) ) + call mpp_get_times( unit, tstamp8(:) ) + + call mpp_get_atts(vars(1),name=varname) + rdata8 = 0 + + if( varname.NE.'Data' )call mpp_error( FATAL, 'File being read is not the expected one.' ) + + call mpp_read( unit, vars(1), domain, rdata8, 1 ) + + rchk = mpp_chksum(rdata8(is:ie,js:je,:)) + chk = mpp_chksum( data8(is+ioff:ie+ioff,js+joff:je+joff,:)) + if( rchk == chk ) then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(type)//' R8: multi-fileset: data comparison are OK.' ) + else + call mpp_error( FATAL, 'R8 Checksum error on multi-threaded/multi-fileset netCDF read for type ' & + //trim(type) ) + end if + + deallocate( atts, axes, vars, tstamp8 ) + deallocate( rdata8, gdata8, data8) + + end subroutine test_netcdf_io_R8 + +end program test_io_R4_R8 diff --git a/test_fms/mpp_io/test_io_R4_R8.sh b/test_fms/mpp_io/test_io_R4_R8.sh new file mode 100755 index 0000000000..82bd26d5b9 --- /dev/null +++ b/test_fms/mpp_io/test_io_R4_R8.sh @@ -0,0 +1,60 @@ +#!/bin/sh + +#*********************************************************************** +#* GNU Lesser General Public License +#* +#* This file is part of the GFDL Flexible Modeling System (FMS). 
+#* +#* FMS is free software: you can redistribute it and/or modify it under +#* the terms of the GNU Lesser General Public License as published by +#* the Free Software Foundation, either version 3 of the License, or (at +#* your option) any later version. +#* +#* FMS is distributed in the hope that it will be useful, but WITHOUT +#* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +#* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +#* for more details. +#* +#* You should have received a copy of the GNU Lesser General Public +#* License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Ryan Mulhall + +# Set common test settings. +. ../test_common.sh + +skip_test="no" + +# Copy file for test. +touch input.nml +cp $top_srcdir/test_fms/mpp_io/input_base.nml input.nml + + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi + +if [ $nProc -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. + skip_test="skip" +elif [ $nProc -lt 12 ] +then + # Need to oversubscribe the MPI + run_test test_io_R4_R8 12 $skip_test "true" +fi + +run_test test_io_R4_R8 12 $skip_test diff --git a/test_fms/mpp_io/test_io_mosaic_R4_R8.F90 b/test_fms/mpp_io/test_io_mosaic_R4_R8.F90 new file mode 100644 index 0000000000..38b1a361aa --- /dev/null +++ b/test_fms/mpp_io/test_io_mosaic_R4_R8.F90 @@ -0,0 +1,421 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). 
+!* +!* FMS is free software: you can redistribute it and/or modify it under +!* the terms of the GNU Lesser General Public License as published by +!* the Free Software Foundation, either version 3 of the License, or (at +!* your option) any later version. +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +!* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +!* for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with FMS. If not, see . +!*********************************************************************** +!> @file +!> @author Ryan Mulhall +!> @brief Unit test for mpp_write/mpp_read on mosaics +!> @email gfdl.climate.model.info@noaa.gov +!> @description Performs reads and writes on mosaic files using mpp_write +!> and mpp_read using 32 and 64 bit reals +program test_io_mosaic_R4_R8 + + use platform_mod + use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe, mpp_error, mpp_sync_self + use mpp_mod, only : FATAL, NOTE, mpp_chksum, MPP_DEBUG, mpp_set_stack_size, MPP_CLOCK_SYNC + use mpp_mod, only : mpp_sync, mpp_exit, mpp_clock_begin, mpp_clock_end, mpp_clock_id + use mpp_mod, only : mpp_init_test_full_init + use mpp_domains_mod, only : mpp_define_domains, mpp_domains_set_stack_size, domain1D, mpp_get_global_domain + use mpp_domains_mod, only : domain2D, mpp_define_layout, mpp_get_domain_components, mpp_define_mosaic + use mpp_domains_mod, only : mpp_get_memory_domain, mpp_get_compute_domain, mpp_domains_exit + use mpp_domains_mod, only : CENTER, EAST, NORTH, CORNER, mpp_get_data_domain + use mpp_domains_mod, only : mpp_define_io_domain, mpp_deallocate_domain + use mpp_io_mod, only : mpp_io_init, mpp_write_meta, axistype, fieldtype, atttype + use mpp_io_mod, only : MPP_RDONLY, mpp_open, MPP_OVERWR, MPP_ASCII, MPP_SINGLE + use mpp_io_mod, only : MPP_NETCDF, MPP_MULTI, mpp_get_atts, 
mpp_write, mpp_close + use mpp_io_mod, only : mpp_get_info, mpp_get_axes, mpp_get_fields, mpp_get_times + use mpp_io_mod, only : mpp_read, mpp_io_exit, MPP_APPEND + +#ifdef INTERNAL_FILE_NML + USE mpp_mod, ONLY: input_nml_file +#endif + + implicit none + +#ifdef use_netCDF +#include +#endif + + !--- namelist definition + integer :: nx=360, ny=200, nz=50, nt=2 + integer :: halo=2, stackmax=1500000, stackmaxd=2000000 + logical :: debug=.FALSE. + character(len=64) :: file='test_mosaic', iospec='-F cachea' + integer :: layout(2) = (/1,1/) + integer :: ntiles_x=3, ntiles_y=4 ! total number of tiles will be ntiles_x*ntiles_y, + ! the grid size for each tile will be (nx/ntiles_x, ny/ntiles_y) + ! set ntiles > 1 to test the efficiency of mpp_io. + integer :: io_layout(2) = (/1,1/) ! set io_layout to divide each tile into io_layout(1)*io_layout(2) + ! group and write out data from the root pe of each group. + integer :: pack_size = 1 + + namelist / test_io_mosaic_nml / nx, ny, nz, nt, halo, stackmax, stackmaxd, debug, file, iospec, & + ntiles_x, ntiles_y, layout, io_layout + + integer :: pe, npes, io_status + type(domain2D) :: domain + integer :: tks_per_sec + integer :: i,j,k, unit=7 + logical :: opened + character(len=64) :: varname + real(r8_kind) :: time8 + real(r4_kind) :: time4 + type(axistype) :: x, y, z, t + type(fieldtype) :: f + type(domain1D) :: xdom, ydom + integer(i8_kind) :: rchk, chk + real(r8_kind) :: doubledata = 0.0 + real :: realarray(4) + integer :: ierr +! initialize mpp and read input namelist + call mpp_init(test_level=mpp_init_test_full_init) + pe = mpp_pe() + npes = mpp_npes() + do + inquire( unit=unit, opened=opened ) + if( .NOT.opened )exit + unit = unit + 1 + if( unit.EQ.100 )call mpp_error( FATAL, 'Unable to locate unit number.' 
) + end do + open( unit=unit, file='input.nml', iostat=io_status) + read( unit,test_io_mosaic_nml, iostat=io_status ) + close(unit) + + if (io_status > 0) then + call mpp_error(FATAL,'=>test_io_mosaic_R4_R8: Error reading input.nml') + endif + + call SYSTEM_CLOCK( count_rate=tks_per_sec ) + if( debug )then + call mpp_io_init(MPP_DEBUG) + else + call mpp_io_init() + end if + call mpp_set_stack_size(stackmax) + call mpp_domains_set_stack_size(stackmaxd) + + if( pe.EQ.mpp_root_pe() )then + print '(a,6i6)', 'npes, nx, ny, nz, nt, halo=', npes, nx, ny, nz, nt, halo + print *, 'Using NEW domaintypes and calls...' + end if + + write( file,'(a,i3.3)' )trim(file), npes +! set layouts to test with mosaic tiles + io_layout(1) = 3 + io_layout(2) = 2 + layout(1) = 3 + layout(2) = 4 +! determine the pack_size + pack_size = size(transfer(doubledata, realarray)) + if( pack_size .NE. 1 .AND. pack_size .NE. 2) call mpp_error(FATAL,'test_io_mosaic_R4_R8: pack_size should be 1 or 2') + + ! test different mosaic reads with r4 + call test_netcdf_io_mosaic_R4('Single_tile_mult_file_R4', layout, 1, 1, (/1,1/) ) + call test_netcdf_io_mosaic_R4('Single_tile_with_group_R4', layout, 1, 1, io_layout) + call test_netcdf_io_mosaic_R4('Mult_tile_R4', layout, io_layout(1), io_layout(2), (/1,1/)) + call test_netcdf_io_mosaic_R4('Mult_tile_with_group_R4', layout, ntiles_x, ntiles_y, io_layout) + ! 
test different mosaic reads with r8 + call test_netcdf_io_mosaic_R8('Single_tile_mult_file_R8', layout, 1, 1, (/1,1/) ) + call test_netcdf_io_mosaic_R8('Single_tile_with_group_R8', layout, 1, 1, io_layout) + call test_netcdf_io_mosaic_R8('Mult_tile_R8', layout, io_layout(1), io_layout(2), (/1,1/)) + call test_netcdf_io_mosaic_R8('Mult_tile_with_group_R8', layout, ntiles_x, ntiles_y, io_layout) + + call mpp_io_exit() + call mpp_domains_exit() + call MPI_FINALIZE(ierr) + + contains + + !------------------------------------------------------------------ + subroutine test_netcdf_io_mosaic_R4(type, layout, ntiles_x, ntiles_y, io_layout) + character(len=*), intent(in) :: type + integer, intent(in) :: layout(:) + integer, intent(in) :: io_layout(:) + integer, intent(in) :: ntiles_x, ntiles_y + + integer :: ndim, nvar, natt, ntime + integer :: isc, iec, jsc, jec, nlon, nlat, n, i, j + integer :: my_tile, ncontacts, npes_per_tile, ntiles + integer, dimension(:), allocatable :: tile1, istart1, iend1, jstart1, jend1 + integer, dimension(:), allocatable :: tile2, istart2, iend2, jstart2, jend2 + integer, dimension(:), allocatable :: pe_start, pe_end + integer, dimension(:,:), allocatable :: layout2D, global_indices + character(len=64) :: output_file + logical :: is_root_pe + real(r4_kind), dimension(:,:,:), allocatable :: data, rdata + type(fieldtype), save :: vars(1) + + npes = mpp_npes() + + ncontacts = 0 + ntiles = ntiles_x*ntiles_y + + npes_per_tile = npes/ntiles + my_tile = mpp_pe()/npes_per_tile + 1 + is_root_pe = .false. + if(mpp_pe() == (my_tile-1)*npes_per_tile ) is_root_pe = .true. + + allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) + !--- for simplify purpose, we assume all the tiles have the same size. 
+ do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + if(ntiles>1) then + nlon = nx/ntiles_x + nlat = ny/ntiles_y + else + nlon = nx + nlat = ny + endif + + do n = 1, ntiles + global_indices(:,n) = (/1,nlon,1,nlat/) + layout2D(1,n) = layout(1)/ntiles_x + layout2D(2,n) = layout(2)/ntiles_y + end do + + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, ncontacts, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, pe_start, pe_end, & + name = type) + call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) + call mpp_get_domain_components(domain, xdom, ydom) + allocate( data (isc:iec,jsc:jec,nz) ) + allocate( rdata(isc:iec,jsc:jec,nz) ) + do k = 1,nz + do j = jsc, jec + do i = isc, iec + data(i,j,k) = k + i*1e-3 + j*1e-6 + enddo + enddo + enddo + + ! open with netcdf distribute write if ntiles = 1, otherwise single-thread write + output_file = type + select case(type) + case("Single_tile_single_file_R4") + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_SINGLE, fileset=MPP_SINGLE ) + case("Single_tile_mult_file_R4") + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI ) + case("Mult_tile_R4") + write(output_file, '(a,I4.4)') type//'.tile', my_tile + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_SINGLE, is_root_pe=is_root_pe ) + case("Single_tile_with_group_R4") + call mpp_define_io_domain(domain, io_layout) + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI, domain=domain) + case("Mult_tile_with_group_R4") + write(output_file, '(a,I4.4)') type//'.tile', my_tile + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI, domain=domain) + + case default + call mpp_error(FATAL, "program test_io_mosaic_R4_R8: invalid value of type="//type) + 
end select + ! write data + call mpp_write_meta( unit, x, 'X', 'km', 'X distance', 'X', domain=xdom, data=(/(i-1.,i=1,nlon)/) ) + call mpp_write_meta( unit, y, 'Y', 'km', 'Y distance', 'Y', domain=ydom, data=(/(i-1.,i=1,nlat)/) ) + call mpp_write_meta( unit, z, 'Z', 'km', 'Z distance', 'Z', data=(/(i-1.,i=1,nz)/) ) + call mpp_write_meta( unit, t, 'T', 'sec', 'Time', 'T' ) + call mpp_write_meta( unit, f, (/x,y,z,t/), 'Data', 'metres', 'Random data', pack=pack_size ) + call mpp_write( unit, x ) + call mpp_write( unit, y ) + call mpp_write( unit, z ) + do i = 0,nt-1 + time4 = i*10. + call mpp_write( unit, f, domain, data, time4 ) + end do + call mpp_close(unit) + + call mpp_sync() !wait for previous write to complete + ! reopen file and check results + select case(type) + case("Single_tile_single_file_R4") + call mpp_open( unit, output_file, action=MPP_RDONLY, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_SINGLE ) + case("Single_tile_mult_file_R4") + call mpp_open( unit, output_file, action=MPP_RDONLY, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI ) + case("Mult_tile_R4") + call mpp_open( unit, output_file, action=MPP_RDONLY, form=MPP_NETCDF, threading=MPP_MULTI, & + fileset=MPP_SINGLE, is_root_pe=is_root_pe ) + case("Single_tile_with_group_R4", "Mult_tile_with_group_R4") + call mpp_open( unit, output_file, action=MPP_RDONLY, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI, domain=domain) + case default + call mpp_error(FATAL, "program test_io_mosaic_R4_R8: invalid value of type="//type) + end select + + call mpp_get_info( unit, ndim, nvar, natt, ntime ) + call mpp_get_fields ( unit, vars(:) ) + call mpp_get_atts(vars(1),name=varname) + + if( varname.NE.'Data' )call mpp_error( FATAL, 'File being read is not the expected one.' ) + do i = 0,nt-1 + call mpp_read( unit, vars(1), domain, rdata, 1 ) + enddo + ! 
compare read and stored data to validate successful write/read + rchk = mpp_chksum(rdata) + chk = mpp_chksum( data) + if( pe.EQ.mpp_root_pe() )print '(a,2z18)', trim(type)//' checksum=', rchk, chk + if( rchk == chk ) then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(type)//': data comparison are OK.' ) + else + call mpp_error( FATAL, 'Checksum error on netCDF read for type ' & + //trim(type) ) + end if + + deallocate( rdata, data) + call mpp_deallocate_domain(domain) + + end subroutine test_netcdf_io_mosaic_R4 + + subroutine test_netcdf_io_mosaic_R8(type, layout, ntiles_x, ntiles_y, io_layout) + character(len=*), intent(in) :: type + integer, intent(in) :: layout(:) + integer, intent(in) :: io_layout(:) + integer, intent(in) :: ntiles_x, ntiles_y + + integer :: ndim, nvar, natt, ntime + integer :: isc, iec, jsc, jec, nlon, nlat, n, i, j + integer :: my_tile, ncontacts, npes_per_tile, ntiles + integer, dimension(:), allocatable :: tile1, istart1, iend1, jstart1, jend1 + integer, dimension(:), allocatable :: tile2, istart2, iend2, jstart2, jend2 + integer, dimension(:), allocatable :: pe_start, pe_end + integer, dimension(:,:), allocatable :: layout2D, global_indices + character(len=64) :: output_file + logical :: is_root_pe + real(r8_kind), dimension(:,:,:), allocatable :: data, rdata + type(fieldtype), save :: vars(1) + + npes = mpp_npes() + + ncontacts = 0 + ntiles = ntiles_x*ntiles_y + + npes_per_tile = npes/ntiles + my_tile = mpp_pe()/npes_per_tile + 1 + is_root_pe = .false. + if(mpp_pe() == (my_tile-1)*npes_per_tile ) is_root_pe = .true. + + allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) + !--- for simplify purpose, we assume all the tiles have the same size. 
+ do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + if(ntiles>1) then + nlon = nx/ntiles_x + nlat = ny/ntiles_y + else + nlon = nx + nlat = ny + endif + + do n = 1, ntiles + global_indices(:,n) = (/1,nlon,1,nlat/) + layout2D(1,n) = layout(1)/ntiles_x + layout2D(2,n) = layout(2)/ntiles_y + end do + + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, ncontacts, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, pe_start, pe_end, & + name = type) + call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) + call mpp_get_domain_components(domain, xdom, ydom) + allocate( data (isc:iec,jsc:jec,nz) ) + allocate( rdata(isc:iec,jsc:jec,nz) ) + do k = 1,nz + do j = jsc, jec + do i = isc, iec + data(i,j,k) = k + i*1e-3 + j*1e-6 + enddo + enddo + enddo + + ! open with netcdf distribute write if ntiles = 1, otherwise single-thread write + output_file = type + select case(type) + case("Single_tile_single_file_R8") + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_SINGLE, fileset=MPP_SINGLE ) + case("Single_tile_mult_file_R8") + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI ) + case("Mult_tile_R8") + write(output_file, '(a,I4.4)') type//'.tile', my_tile + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_SINGLE, is_root_pe=is_root_pe ) + case("Single_tile_with_group_R8") + call mpp_define_io_domain(domain, io_layout) + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI, domain=domain) + case("Mult_tile_with_group_R8") + write(output_file, '(a,I4.4)') type//'.tile', my_tile + call mpp_open( unit, output_file, action=MPP_OVERWR, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI, domain=domain) + + case default + call mpp_error(FATAL, "program test_io_mosaic_R4_R8: invalid value of type="//type) + 
end select + ! write data + call mpp_write_meta( unit, x, 'X', 'km', 'X distance', 'X', domain=xdom, data=(/(i-1.,i=1,nlon)/) ) + call mpp_write_meta( unit, y, 'Y', 'km', 'Y distance', 'Y', domain=ydom, data=(/(i-1.,i=1,nlat)/) ) + call mpp_write_meta( unit, z, 'Z', 'km', 'Z distance', 'Z', data=(/(i-1.,i=1,nz)/) ) + call mpp_write_meta( unit, t, 'T', 'sec', 'Time', 'T' ) + call mpp_write_meta( unit, f, (/x,y,z,t/), 'Data', 'metres', 'Random data', pack=pack_size ) + call mpp_write( unit, x ) + call mpp_write( unit, y ) + call mpp_write( unit, z ) + do i = 0,nt-1 + time8 = i*10. + call mpp_write( unit, f, domain, data, time8 ) + end do + call mpp_close(unit) + + call mpp_sync() !wait for previous write to complete + ! reopen file and check results + select case(type) + case("Single_tile_single_file_R8") + call mpp_open( unit, output_file, action=MPP_RDONLY, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_SINGLE ) + case("Single_tile_mult_file_R8") + call mpp_open( unit, output_file, action=MPP_RDONLY, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI ) + case("Mult_tile_R8") + call mpp_open( unit, output_file, action=MPP_RDONLY, form=MPP_NETCDF, threading=MPP_MULTI, & + fileset=MPP_SINGLE, is_root_pe=is_root_pe ) + case("Single_tile_with_group_R8", "Mult_tile_with_group_R8") + call mpp_open( unit, output_file, action=MPP_RDONLY, form=MPP_NETCDF, threading=MPP_MULTI, fileset=MPP_MULTI, domain=domain) + case default + call mpp_error(FATAL, "program test_io_mosaic_R4_R8: invalid value of type="//type) + end select + + call mpp_get_info( unit, ndim, nvar, natt, ntime ) + call mpp_get_fields ( unit, vars(:) ) + call mpp_get_atts(vars(1),name=varname) + + if( varname.NE.'Data' )call mpp_error( FATAL, 'File being read is not the expected one.' ) + do i = 0,nt-1 + call mpp_read( unit, vars(1), domain, rdata, 1 ) + enddo + ! 
compare read and stored data to validate successful write/read + rchk = mpp_chksum(rdata) + chk = mpp_chksum( data) + if( pe.EQ.mpp_root_pe() )print '(a,2z18)', trim(type)//' checksum=', rchk, chk + if( rchk == chk ) then + if( pe.EQ.mpp_root_pe() )call mpp_error( NOTE, trim(type)//': data comparison are OK.' ) + else + call mpp_error( FATAL, 'Checksum error on netCDF read for type ' & + //trim(type) ) + end if + + + deallocate( rdata, data) + call mpp_deallocate_domain(domain) + + end subroutine test_netcdf_io_mosaic_R8 + +end program test_io_mosaic_R4_R8 diff --git a/test_fms/mpp_io/test_io_mosaic_R4_R8.sh b/test_fms/mpp_io/test_io_mosaic_R4_R8.sh new file mode 100755 index 0000000000..aa3109aaf2 --- /dev/null +++ b/test_fms/mpp_io/test_io_mosaic_R4_R8.sh @@ -0,0 +1,60 @@ +#!/bin/sh + +#*********************************************************************** +#* GNU Lesser General Public License +#* +#* This file is part of the GFDL Flexible Modeling System (FMS). +#* +#* FMS is free software: you can redistribute it and/or modify it under +#* the terms of the GNU Lesser General Public License as published by +#* the Free Software Foundation, either version 3 of the License, or (at +#* your option) any later version. +#* +#* FMS is distributed in the hope that it will be useful, but WITHOUT +#* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +#* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +#* for more details. +#* +#* You should have received a copy of the GNU Lesser General Public +#* License along with FMS. If not, see . +#*********************************************************************** + +# This is part of the GFDL FMS package. This is a shell script to +# execute tests in the test_fms/mpp directory. + +# Ryan Mulhall + +# Set common test settings. +. ../test_common.sh + +skip_test="no" + +# Copy file for test. 
+touch input.nml +cp $top_srcdir/test_fms/mpp_io/input_base.nml input.nml + + +# Get the number of available CPUs on the system +if [ $(command -v nproc) ] +then + # Looks like a linux system + nProc=$(nproc) +elif [ $(command -v sysctl) ] +then + # Looks like a Mac OS X system + nProc=$(sysctl -n hw.physicalcpu) +else + nProc=-1 +fi +if [ $nProc -lt 0 ] +then + # Couldn't get the number of CPUs, skip the test. + skip_test="skip" + run_test test_io_mosaic_R4_R8 12 $skip_test +elif [ $nProc -lt 12 ] +then + # Need to oversubscribe the MPI + run_test test_io_mosaic_R4_R8 12 $skip_test "true" +else + run_test test_io_mosaic_R4_R8 12 $skip_test +fi diff --git a/test_fms/mpp_io/test_mpp_io.F90 b/test_fms/mpp_io/test_mpp_io.F90 index 41bc17337b..ffad0a13b2 100644 --- a/test_fms/mpp_io/test_mpp_io.F90 +++ b/test_fms/mpp_io/test_mpp_io.F90 @@ -17,8 +17,8 @@ !* License along with FMS. If not, see . !*********************************************************************** program test -#include + use platform_mod, only : i8_kind, r8_kind use mpp_mod, only : mpp_init, mpp_pe, mpp_npes, mpp_root_pe, mpp_error, mpp_sync_self use mpp_mod, only : FATAL, NOTE, mpp_chksum, MPP_DEBUG, mpp_set_stack_size, MPP_CLOCK_SYNC use mpp_mod, only : mpp_sync, mpp_exit, mpp_clock_begin, mpp_clock_end, mpp_clock_id @@ -76,8 +76,8 @@ program test type(axistype) :: x, y, z, t type(fieldtype) :: f type(domain1D) :: xdom, ydom - integer(LONG_KIND) :: rchk, chk - real(DOUBLE_KIND) :: doubledata = 0.0 + integer(i8_kind) :: rchk, chk + real(r8_kind) :: doubledata = 0.0 real :: realarray(4) call mpp_init() diff --git a/test_fms/mpp_io/test_mpp_io2.sh b/test_fms/mpp_io/test_mpp_io2.sh index 18036e89f9..fffbf3d4a5 100755 --- a/test_fms/mpp_io/test_mpp_io2.sh +++ b/test_fms/mpp_io/test_mpp_io2.sh @@ -30,6 +30,7 @@ skip_test="no" # Copy file for test. 
+touch input.nml cp $top_srcdir/test_fms/mpp_io/input_base.nml input.nml diff --git a/test_fms/time_interp/Makefile.am b/test_fms/time_interp/Makefile.am index bbbeab8195..53110b1d43 100644 --- a/test_fms/time_interp/Makefile.am +++ b/test_fms/time_interp/Makefile.am @@ -23,7 +23,8 @@ # uramirez, Ed Hartnett # Find the fms_mod.mod file. -AM_CPPFLAGS = -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_builddir}/.mod \ + -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/test_fms/time_manager/Makefile.am b/test_fms/time_manager/Makefile.am index 24de202652..e3347b1a84 100644 --- a/test_fms/time_manager/Makefile.am +++ b/test_fms/time_manager/Makefile.am @@ -23,7 +23,7 @@ # uramirez, Ed Hartnett # Find the fms_mod.mod file. -AM_CPPFLAGS = -I${top_builddir}/.mod +AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_builddir}/platform # Link to the FMS library. LDADD = ${top_builddir}/libFMS/libFMS.la diff --git a/time_manager/Makefile.am b/time_manager/Makefile.am index c0918f8f35..0139fb43a5 100644 --- a/time_manager/Makefile.am +++ b/time_manager/Makefile.am @@ -23,7 +23,11 @@ # Ed Hartnett 2/22/19 # Include .h and .mod files. -AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS = -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform +AM_CPPFLAGS += -I${top_builddir}/constants +AM_CPPFLAGS += -I${top_builddir}/fms +AM_CPPFLAGS += -I${top_builddir}/mpp # Build these uninstalled convenience libraries. noinst_LTLIBRARIES = libtime_manager.la libget_cal_time.la diff --git a/time_manager/time_manager.F90 b/time_manager/time_manager.F90 index 386711751e..412fd4d126 100644 --- a/time_manager/time_manager.F90 +++ b/time_manager/time_manager.F90 @@ -89,8 +89,7 @@ module time_manager_mod ! contains three PRIVATE variables: days, seconds and ticks. ! 
-#include - +use platform_mod, only: r8_kind use constants_mod, only: rseconds_per_day=>seconds_per_day use fms_mod, only: error_mesg, FATAL, WARNING, write_version_number, stdout @@ -1255,7 +1254,7 @@ end subroutine time_assignment function time_type_to_real(time) -real(DOUBLE_KIND) :: time_type_to_real +real(kind=r8_kind) :: time_type_to_real type(time_type), intent(in) :: time if(.not.module_is_initialized) call time_manager_init diff --git a/topography/Makefile.am b/topography/Makefile.am index 72140c1990..91c0363f08 100644 --- a/topography/Makefile.am +++ b/topography/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build these uninstalled convenience libraries. noinst_LTLIBRARIES = libgaussian_topog.la libtopography.la diff --git a/tracer_manager/Makefile.am b/tracer_manager/Makefile.am index 5dcd722bf2..96f98a1824 100644 --- a/tracer_manager/Makefile.am +++ b/tracer_manager/Makefile.am @@ -24,6 +24,7 @@ # Include .h and .mod files. AM_CPPFLAGS = -I${top_builddir}/.mod -I${top_srcdir}/include +AM_CPPFLAGS += -I${top_builddir}/platform # Build this uninstalled convenience library. noinst_LTLIBRARIES = libtracer_manager.la