diff --git a/cicecore/cicedynB/dynamics/ice_dyn_evp.F90 b/cicecore/cicedynB/dynamics/ice_dyn_evp.F90 index b796488b7..276c8bb79 100644 --- a/cicecore/cicedynB/dynamics/ice_dyn_evp.F90 +++ b/cicecore/cicedynB/dynamics/ice_dyn_evp.F90 @@ -34,7 +34,7 @@ module ice_dyn_evp use ice_kinds_mod - use ice_communicate, only: my_task + use ice_communicate, only: my_task, master_task use ice_constants, only: field_loc_center, field_loc_NEcorner, & field_type_scalar, field_type_vector use ice_constants, only: c0, p027, p055, p111, p166, & @@ -88,14 +88,14 @@ subroutine evp (dt) stress12_1, stress12_2, stress12_3, stress12_4 use ice_grid, only: tmask, umask, dxt, dyt, dxhy, dyhx, cxp, cyp, cxm, cym, & tarear, uarear, tinyarea, to_ugrid, t2ugrid_vector, u2tgrid_vector, & - grid_type, HTE, HTN + grid_type use ice_state, only: aice, vice, vsno, uvel, vvel, divu, shear, & aice_init, aice0, aicen, vicen, strength use ice_timers, only: timer_dynamics, timer_bound, & ice_timer_start, ice_timer_stop, timer_evp_1d, timer_evp_2d use ice_dyn_evp_1d, only: ice_dyn_evp_1d_copyin, ice_dyn_evp_1d_kernel, & ice_dyn_evp_1d_copyout - use ice_dyn_shared, only: kevp_kernel, stack_velocity_field, unstack_velocity_field + use ice_dyn_shared, only: evp_algorithm, stack_velocity_field, unstack_velocity_field real (kind=dbl_kind), intent(in) :: & dt ! time step @@ -331,7 +331,7 @@ subroutine evp (dt) if (seabed_stress) then - !$OMP PARALLEL DO PRIVATE(iblk) + !$OMP PARALLEL DO PRIVATE(iblk) do iblk = 1, nblocks if ( seabed_stress_method == 'LKD' ) then @@ -351,118 +351,115 @@ subroutine evp (dt) hwater(:,:,iblk), Tbu(:,:,iblk)) endif - enddo + enddo !$OMP END PARALLEL DO endif + call ice_timer_start(timer_evp_2d) - if (kevp_kernel > 0) then - if (first_time .and. my_task == 0) then - write(nu_diag,'(2a,i6)') subname,' Entering kevp_kernel version ',kevp_kernel - first_time = .false. - endif - if (trim(grid_type) == 'tripole') then - call abort_ice(trim(subname)//' Kernel not tested on tripole grid. Set kevp_kernel=0') - endif - call ice_dyn_evp_1d_copyin( & - nx_block,ny_block,nblocks,nx_global+2*nghost,ny_global+2*nghost, & - HTE,HTN, & -!v1 dxhy,dyhx,cyp,cxp,cym,cxm,tinyarea, & -!v1 waterx,watery, & - icetmask, iceumask, & - cdn_ocn,aiu,uocn,vocn,forcex,forcey,Tbu, & - umassdti,fm,uarear,tarear,strintx,strinty,uvel_init,vvel_init,& - strength,uvel,vvel,dxt,dyt, & - stressp_1 ,stressp_2, stressp_3, stressp_4, & - stressm_1 ,stressm_2, stressm_3, stressm_4, & - stress12_1,stress12_2,stress12_3,stress12_4 ) - if (kevp_kernel == 2) then - call ice_timer_start(timer_evp_1d) - call ice_dyn_evp_1d_kernel() - call ice_timer_stop(timer_evp_1d) -!v1 else if (kevp_kernel == 1) then -!v1 call evp_kernel_v1() - else - if (my_task == 0) write(nu_diag,*) subname,' ERROR: kevp_kernel = ',kevp_kernel - call abort_ice(subname//' kevp_kernel not supported.') - endif - call ice_dyn_evp_1d_copyout( & - nx_block,ny_block,nblocks,nx_global+2*nghost,ny_global+2*nghost,& -!strocn uvel,vvel, strocnx,strocny, strintx,strinty, & - uvel,vvel, strintx,strinty, & - stressp_1, stressp_2, stressp_3, stressp_4, & - stressm_1, stressm_2, stressm_3, stressm_4, & - stress12_1,stress12_2,stress12_3,stress12_4, & - divu,rdg_conv,rdg_shear,shear,taubx,tauby ) - - else ! kevp_kernel == 0 (Standard CICE) - - do ksub = 1,ndte ! subcycling - - !----------------------------------------------------------------- - ! 
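! The integer switch kevp_kernel is retired in favour of the string
! option evp_algorithm, whose supported values are 'standard_2d'
! (classic block-structured solver) and 'shared_mem_1d' (vectorized
! 1D solver).  A minimal sketch of the corresponding namelist entry;
! the group name dynamics_nml is an assumption, not shown in this
! diff:
!
!    &dynamics_nml
!      evp_algorithm = 'shared_mem_1d'   ! or 'standard_2d'
!    /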
stress tensor equation, total surface stress - !----------------------------------------------------------------- - - !$TCXOMP PARALLEL DO PRIVATE(iblk,strtmp) - do iblk = 1, nblocks -! if (trim(yield_curve) == 'ellipse') then - call stress (nx_block, ny_block, & - ksub, icellt(iblk), & - indxti (:,iblk), indxtj (:,iblk), & - uvel (:,:,iblk), vvel (:,:,iblk), & - dxt (:,:,iblk), dyt (:,:,iblk), & - dxhy (:,:,iblk), dyhx (:,:,iblk), & - cxp (:,:,iblk), cyp (:,:,iblk), & - cxm (:,:,iblk), cym (:,:,iblk), & - tarear (:,:,iblk), tinyarea (:,:,iblk), & - strength (:,:,iblk), & - stressp_1 (:,:,iblk), stressp_2 (:,:,iblk), & - stressp_3 (:,:,iblk), stressp_4 (:,:,iblk), & - stressm_1 (:,:,iblk), stressm_2 (:,:,iblk), & - stressm_3 (:,:,iblk), stressm_4 (:,:,iblk), & - stress12_1(:,:,iblk), stress12_2(:,:,iblk), & - stress12_3(:,:,iblk), stress12_4(:,:,iblk), & - shear (:,:,iblk), divu (:,:,iblk), & - rdg_conv (:,:,iblk), rdg_shear (:,:,iblk), & - strtmp (:,:,:) ) -! endif ! yield_curve - - !----------------------------------------------------------------- - ! momentum equation - !----------------------------------------------------------------- - - call stepu (nx_block, ny_block, & - icellu (iblk), Cdn_ocn (:,:,iblk), & - indxui (:,iblk), indxuj (:,iblk), & - ksub, & - aiu (:,:,iblk), strtmp (:,:,:), & - uocn (:,:,iblk), vocn (:,:,iblk), & - waterx (:,:,iblk), watery (:,:,iblk), & - forcex (:,:,iblk), forcey (:,:,iblk), & - umassdti (:,:,iblk), fm (:,:,iblk), & - uarear (:,:,iblk), & - strintx (:,:,iblk), strinty (:,:,iblk), & - taubx (:,:,iblk), tauby (:,:,iblk), & - uvel_init(:,:,iblk), vvel_init(:,:,iblk),& - uvel (:,:,iblk), vvel (:,:,iblk), & - Tbu (:,:,iblk)) - enddo - !$TCXOMP END PARALLEL DO + if (evp_algorithm == "shared_mem_1d" ) then - call stack_velocity_field(uvel, vvel, fld2) - call ice_timer_start(timer_bound) - if (maskhalo_dyn) then - call ice_HaloUpdate (fld2, halo_info_mask, & - field_loc_NEcorner, field_type_vector) - else - call ice_HaloUpdate (fld2, halo_info, & - field_loc_NEcorner, field_type_vector) + if (first_time .and. my_task == master_task) then + write(nu_diag,'(3a)') subname,' Entering evp_algorithm version ',evp_algorithm + first_time = .false. endif - call ice_timer_stop(timer_bound) - call unstack_velocity_field(fld2, uvel, vvel) + if (trim(grid_type) == 'tripole') then + call abort_ice(trim(subname)//' & + & Kernel not tested on tripole grid. Set evp_algorithm=standard_2d') + endif + + call ice_dyn_evp_1d_copyin( & + nx_block,ny_block,nblocks,nx_global+2*nghost,ny_global+2*nghost, & + icetmask, iceumask, & + cdn_ocn,aiu,uocn,vocn,forcex,forcey,Tbu, & + umassdti,fm,uarear,tarear,strintx,strinty,uvel_init,vvel_init,& + strength,uvel,vvel,dxt,dyt, & + stressp_1 ,stressp_2, stressp_3, stressp_4, & + stressm_1 ,stressm_2, stressm_3, stressm_4, & + stress12_1,stress12_2,stress12_3,stress12_4 ) + call ice_timer_start(timer_evp_1d) + call ice_dyn_evp_1d_kernel() + call ice_timer_stop(timer_evp_1d) + call ice_dyn_evp_1d_copyout( & + nx_block,ny_block,nblocks,nx_global+2*nghost,ny_global+2*nghost,& +!strocn uvel,vvel, strocnx,strocny, strintx,strinty, & + uvel,vvel, strintx,strinty, & + stressp_1, stressp_2, stressp_3, stressp_4, & + stressm_1, stressm_2, stressm_3, stressm_4, & + stress12_1,stress12_2,stress12_3,stress12_4, & + divu,rdg_conv,rdg_shear,shear,taubx,tauby ) + + else ! evp_algorithm == standard_2d (Standard CICE) + + do ksub = 1,ndte ! subcycling + + !----------------------------------------------------------------- + ! 
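! Each dynamics time step dt is split into ndte elastic subcycles;
! every subcycle updates the stress tensor on T cells (stress), then
! the momentum equation on U points (stepu), and finally refreshes
! the velocity halo so the next subcycle sees consistent neighbour
! values.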
stress tensor equation, total surface stress + !----------------------------------------------------------------- + + !$TCXOMP PARALLEL DO PRIVATE(iblk,strtmp) + do iblk = 1, nblocks + +! if (trim(yield_curve) == 'ellipse') then + call stress (nx_block, ny_block, & + ksub, icellt(iblk), & + indxti (:,iblk), indxtj (:,iblk), & + uvel (:,:,iblk), vvel (:,:,iblk), & + dxt (:,:,iblk), dyt (:,:,iblk), & + dxhy (:,:,iblk), dyhx (:,:,iblk), & + cxp (:,:,iblk), cyp (:,:,iblk), & + cxm (:,:,iblk), cym (:,:,iblk), & + tarear (:,:,iblk), tinyarea (:,:,iblk), & + strength (:,:,iblk), & + stressp_1 (:,:,iblk), stressp_2 (:,:,iblk), & + stressp_3 (:,:,iblk), stressp_4 (:,:,iblk), & + stressm_1 (:,:,iblk), stressm_2 (:,:,iblk), & + stressm_3 (:,:,iblk), stressm_4 (:,:,iblk), & + stress12_1(:,:,iblk), stress12_2(:,:,iblk), & + stress12_3(:,:,iblk), stress12_4(:,:,iblk), & + shear (:,:,iblk), divu (:,:,iblk), & + rdg_conv (:,:,iblk), rdg_shear (:,:,iblk), & + strtmp (:,:,:) ) +! endif ! yield_curve + + !----------------------------------------------------------------- + ! momentum equation + !----------------------------------------------------------------- + + call stepu (nx_block, ny_block, & + icellu (iblk), Cdn_ocn (:,:,iblk), & + indxui (:,iblk), indxuj (:,iblk), & + ksub, & + aiu (:,:,iblk), strtmp (:,:,:), & + uocn (:,:,iblk), vocn (:,:,iblk), & + waterx (:,:,iblk), watery (:,:,iblk), & + forcex (:,:,iblk), forcey (:,:,iblk), & + umassdti (:,:,iblk), fm (:,:,iblk), & + uarear (:,:,iblk), & + strintx (:,:,iblk), strinty (:,:,iblk), & + taubx (:,:,iblk), tauby (:,:,iblk), & + uvel_init(:,:,iblk), vvel_init(:,:,iblk),& + uvel (:,:,iblk), vvel (:,:,iblk), & + Tbu (:,:,iblk)) + + enddo + !$TCXOMP END PARALLEL DO + + call stack_velocity_field(uvel, vvel, fld2) + call ice_timer_start(timer_bound) + if (maskhalo_dyn) then + call ice_HaloUpdate (fld2, halo_info_mask, & + field_loc_NEcorner, field_type_vector) + else + call ice_HaloUpdate (fld2, halo_info, & + field_loc_NEcorner, field_type_vector) + endif + call ice_timer_stop(timer_bound) + call unstack_velocity_field(fld2, uvel, vvel) - enddo ! subcycling - endif ! kevp_kernel + enddo ! subcycling + endif ! evp_algorithm + call ice_timer_stop(timer_evp_2d) deallocate(fld2) diff --git a/cicecore/cicedynB/dynamics/ice_dyn_evp_1d.F90 b/cicecore/cicedynB/dynamics/ice_dyn_evp_1d.F90 old mode 100644 new mode 100755 index 78469cc86..c691453cb --- a/cicecore/cicedynB/dynamics/ice_dyn_evp_1d.F90 +++ b/cicecore/cicedynB/dynamics/ice_dyn_evp_1d.F90 @@ -1,2135 +1,1941 @@ -! ice_dyn_evp_1d +!======================================================================= ! -! Contained 3 Fortran modules, -! * dmi_omp -! * bench_v2 -! * ice_dyn_evp_1d -! These were merged into one module, ice_dyn_evp_1d to support some -! coupled build systems. +! Elastic-viscous-plastic sea ice dynamics model (1D implementations) +! Computes ice velocity and deformation ! -! Modules used for: -! * convert 2D arrays into 1D vectors -! * Do stress/stepu/halo_update interations -! * convert 1D vectors into 2D matrices -! -! Call from ice_dyn_evp.F90: -! call ice_dyn_evp_1d_copyin(...) -! call ice_dyn_evp_1d_kernel() -! call ice_dyn_evp_1d_copyout(...) -! -! * REAL4 internal version: -! mv evp_kernel1d.F90 evp_kernel1d_r8.F90 -! cat evp_kernel1d_r8.F90 | sed s/DBL_KIND/REAL_KIND/g > evp_kernel1d.F90 -! -! * !v1 : a) "dxhy,dyhx,cyp,cxp,cym,cxm,tinyarea" input variables is replaced by -! "HTE,HTN"->"HTE,HTN,HTEm1,HTNm1" and variables are calculated in-line -! 
b) "waterx,watery" is calculated using existing input "uocn,vocn" -! -! Jacob Weismann Poulsen (JWP), Mads Hvid Ribergaard (MHRI), DMI -!=============================================================================== +! authors: Jacob Weismann Poulsen, DMI +! Mads Hvid Ribergaard, DMI -!=============================================================================== - -!-- One dimension representation of EVP 2D arrays used for EVP kernels module ice_dyn_evp_1d - use ice_kinds_mod - use ice_fileunits, only: nu_diag - use ice_exit, only: abort_ice - use icepack_intfc, only: icepack_query_parameters - use icepack_intfc, only: icepack_warnings_flush, icepack_warnings_aborted - - implicit none - private - public :: ice_dyn_evp_1d_copyin, ice_dyn_evp_1d_copyout, ice_dyn_evp_1d_kernel - - interface ice_dyn_evp_1d_copyin -! module procedure evp_copyin_v1 - module procedure evp_copyin_v2 - end interface - - interface ice_dyn_evp_1d_kernel -! module procedure evp_kernel_v1 - module procedure evp_kernel_v2 - end interface - - interface ice_dyn_evp_1d_copyout - module procedure evp_copyout - end interface - - interface convert_2d_1d -! module procedure convert_2d_1d_v1 - module procedure convert_2d_1d_v2 - end interface - - integer(kind=int_kind) :: & - NA_len, NAVEL_len - logical(kind=log_kind), dimension(:), allocatable :: & - skipucell - integer(kind=int_kind), dimension(:), allocatable :: & - ee,ne,se,nw,sw,sse,indi,indj,indij , halo_parent - real (kind=dbl_kind), dimension(:), allocatable :: & - cdn_ocn,aiu,uocn,vocn,forcex,forcey,Tbu,tarear, & - umassdti,fm,uarear,strintx,strinty,uvel_init,vvel_init - real (kind=dbl_kind), dimension(:), allocatable :: & - strength,uvel,vvel,dxt,dyt, & -!v1 dxhy,dyhx,cyp,cxp,cym,cxm,tinyarea, & -!v1 waterx,watery, & - stressp_1, stressp_2, stressp_3, stressp_4, & - stressm_1, stressm_2, stressm_3, stressm_4, & - stress12_1,stress12_2,stress12_3,stress12_4, & - divu,rdg_conv,rdg_shear,shear,taubx,tauby - real (kind=DBL_KIND), dimension(:), allocatable :: & - str1, str2, str3, str4, str5, str6, str7, str8 - real (kind=dbl_kind), dimension(:), allocatable :: & - HTE,HTN, & - HTEm1,HTNm1 - logical(kind=log_kind),parameter :: dbug = .false. - - -!--- dmi_omp --------------------------- - interface domp_get_domain - module procedure domp_get_domain_rlu - end interface - - INTEGER, PARAMETER :: JPIM = SELECTED_INT_KIND(9) - integer(int_kind) :: domp_iam, domp_nt + use ice_kinds_mod + use ice_fileunits, only : nu_diag + use ice_exit, only : abort_ice + use icepack_intfc, only : icepack_query_parameters + use icepack_intfc, only : icepack_warnings_flush, & + icepack_warnings_aborted + implicit none + private + public :: ice_dyn_evp_1d_copyin, ice_dyn_evp_1d_copyout, & + ice_dyn_evp_1d_kernel + + integer(kind=int_kind) :: NA_len, NAVEL_len, domp_iam, domp_nt #if defined (_OPENMP) - ! Please note, this constant will create a compiler info for a constant - ! 
expression in IF statements: - real(kind=dbl_kind) :: rdomp_iam, rdomp_nt - !$OMP THREADPRIVATE(domp_iam,domp_nt,rdomp_iam,rdomp_nt) + real(kind=dbl_kind) :: rdomp_iam, rdomp_nt + !$OMP THREADPRIVATE(domp_iam, domp_nt, rdomp_iam, rdomp_nt) #endif -!--- dmi_omp --------------------------- - -!--- bench_v2 -------------------------- - interface evp1d_stress - module procedure stress_i - module procedure stress_l - end interface - interface evp1d_stepu - module procedure stepu_iter - module procedure stepu_last - end interface -!--- bench_v2 -------------------------- + logical(kind=log_kind), dimension(:), allocatable :: skiptcell, skipucell + integer(kind=int_kind), dimension(:), allocatable :: ee, ne, se, & + nw, sw, sse, indi, indj, indij, halo_parent + real(kind=dbl_kind), dimension(:), allocatable :: cdn_ocn, aiu, & + uocn, vocn, forcex, forcey, Tbu, tarear, umassdti, fm, uarear, & + strintx, strinty, uvel_init, vvel_init, strength, uvel, vvel, & + dxt, dyt, stressp_1, stressp_2, stressp_3, stressp_4, stressm_1, & + stressm_2, stressm_3, stressm_4, stress12_1, stress12_2, & + stress12_3, stress12_4, divu, rdg_conv, rdg_shear, shear, taubx, & + tauby, str1, str2, str3, str4, str5, str6, str7, str8, HTE, HTN, & + HTEm1, HTNm1 + integer, parameter :: JPIM = selected_int_kind(9) + + interface evp1d_stress + module procedure stress_iter + module procedure stress_last + end interface + + interface evp1d_stepu + module procedure stepu_iter + module procedure stepu_last + end interface + +!======================================================================= + +contains + +!======================================================================= + + subroutine domp_init +#if defined (_OPENMP) - contains + use omp_lib, only : omp_get_thread_num, omp_get_num_threads +#endif -!=============================================================================== -!former module dmi_omp + implicit none - subroutine domp_init(nt_out) + character(len=*), parameter :: subname = '(domp_init)' + !$OMP PARALLEL DEFAULT(none) #if defined (_OPENMP) - use omp_lib, only : omp_get_thread_num, omp_get_num_threads + domp_iam = omp_get_thread_num() + rdomp_iam = real(domp_iam, dbl_kind) + domp_nt = omp_get_num_threads() + rdomp_nt = real(domp_nt, dbl_kind) +#else + domp_iam = 0 + domp_nt = 1 #endif + !$OMP END PARALLEL - integer(int_kind), intent(out) :: nt_out + end subroutine domp_init - character(len=*), parameter :: subname = '(domp_init)' - !--------------------------------------- +!======================================================================= - !$OMP PARALLEL DEFAULT(none) + subroutine domp_get_domain(lower, upper, d_lower, d_upper) #if defined (_OPENMP) - domp_iam = omp_get_thread_num() - rdomp_iam = real(domp_iam,dbl_kind) - domp_nt = omp_get_num_threads() - rdomp_nt = real(domp_nt,dbl_kind) -#else - domp_iam = 0 - domp_nt = 1 -#endif - !$OMP END PARALLEL - - if (dbug) then -#if defined (_OPENACC) - write(nu_diag,'(2a)') subname,' Build with openACC support' -!#elif defined (_OPENMP) -! write(nu_diag,'(2a)') subname,' Build with openMP support' -!#else -! write(nu_diag,'(2a)') subname,' Build without openMP and openACC support' + + use omp_lib, only : omp_in_parallel + use ice_constants, only : p5 #endif - !- echo #threads: - if (domp_nt > 1) then - write(nu_diag,'(2a,i5,a)') subname,' Running openMP with ', domp_nt, ' threads' - else + implicit none + + integer(kind=JPIM), intent(in) :: lower, upper + integer(kind=JPIM), intent(out) :: d_lower, d_upper + + ! 
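! The work-sharing split computed below hands every thread a
! near-equal contiguous slice of lower..upper.  Worked example,
! assuming lower=1, upper=10 and domp_nt=4 (so dlen=10):
!
!   thread 0: d_lower = 1 + floor( 0.5/4) = 1, d_upper = 0 + floor(10.5/4) =  2
!   thread 1: d_lower = 1 + floor(10.5/4) = 3, d_upper = 0 + floor(20.5/4) =  5
!   thread 2: d_lower = 1 + floor(20.5/4) = 6, d_upper = 0 + floor(30.5/4) =  7
!   thread 3: d_lower = 1 + floor(30.5/4) = 8, d_upper = 0 + floor(40.5/4) = 10
!
! i.e. slices of 2, 3, 2 and 3 iterations covering 1..10 exactly once.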
local variables #if defined (_OPENMP) - write(nu_diag,'(2a)') subname,' Running openMP with a single thread' -#else - write(nu_diag,'(2a)') subname,' Running without openMP' -#endif - endif - endif - !- return value of #threads: - nt_out = domp_nt + real(kind=dbl_kind) :: dlen +#endif - end subroutine domp_init - -!---------------------------------------------------------------------------- + character(len=*), parameter :: subname = '(domp_get_domain)' - subroutine domp_get_domain_rlu(lower,upper,d_lower,d_upper) + ! proper action in "null" case + if (upper <= 0 .or. upper < lower) then + d_lower = 0 + d_upper = -1 + return + end if + ! proper action in serial case + d_lower = lower + d_upper = upper #if defined (_OPENMP) - use omp_lib, only : omp_in_parallel - use ice_constants, only: p5 + + if (omp_in_parallel()) then + dlen = real((upper - lower + 1), dbl_kind) + d_lower = lower + floor(((rdomp_iam * dlen + p5) / rdomp_nt), JPIM) + d_upper = lower - 1 + floor(((rdomp_iam * dlen + dlen + p5) / rdomp_nt), JPIM) + end if #endif - integer(KIND=JPIM), intent(in) :: lower,upper - integer(KIND=JPIM), intent(out) :: d_lower,d_upper + end subroutine domp_get_domain + +!======================================================================= + + subroutine stress_iter(NA_len, ee, ne, se, lb, ub, uvel, vvel, dxt, & + dyt, hte, htn, htem1, htnm1, strength, stressp_1, stressp_2, & + stressp_3, stressp_4, stressm_1, stressm_2, stressm_3, & + stressm_4, stress12_1, stress12_2, stress12_3, stress12_4, str1, & + str2, str3, str4, str5, str6, str7, str8, skiptcell) + + use ice_kinds_mod + use ice_constants, only : p027, p055, p111, p166, p222, p25, & + p333, p5, c1p5, c1 + use ice_dyn_shared, only : ecci, denom1, arlx1i, Ktens, revp + + implicit none + + integer(kind=int_kind), intent(in) :: NA_len, lb, ub + integer(kind=int_kind), dimension(:), intent(in), contiguous :: & + ee, ne, se + real(kind=dbl_kind), dimension(:), intent(in), contiguous :: & + strength, uvel, vvel, dxt, dyt, hte, htn, htem1, htnm1 + logical(kind=log_kind), intent(in), dimension(:) :: skiptcell + real(kind=dbl_kind), dimension(:), intent(inout), contiguous :: & + stressp_1, stressp_2, stressp_3, stressp_4, stressm_1, & + stressm_2, stressm_3, stressm_4, stress12_1, stress12_2, & + stress12_3, stress12_4 + real(kind=dbl_kind), dimension(:), intent(out), contiguous :: & + str1, str2, str3, str4, str5, str6, str7, str8 + + ! 
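! stress_iter performs one EVP subcycle of the stress update over the
! compressed 1D vector of active T cells: OpenACC builds offload the
! whole loop 1..NA_len to the device, while OpenMP builds let
! domp_get_domain hand each thread its slice il..iu of lb..ub.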
local variables + + integer(kind=int_kind) :: iw, il, iu + real(kind=dbl_kind) :: puny, divune, divunw, divuse, divusw, & + tensionne, tensionnw, tensionse, tensionsw, shearne, shearnw, & + shearse, shearsw, Deltane, Deltanw, Deltase, Deltasw, c0ne, & + c0nw, c0se, c0sw, c1ne, c1nw, c1se, c1sw, ssigpn, ssigps, & + ssigpe, ssigpw, ssigmn, ssigms, ssigme, ssigmw, ssig12n, & + ssig12s, ssig12e, ssig12w, ssigp1, ssigp2, ssigm1, ssigm2, & + ssig121, ssig122, csigpne, csigpnw, csigpse, csigpsw, & + csigmne, csigmnw, csigmse, csigmsw, csig12ne, csig12nw, & + csig12se, csig12sw, str12ew, str12we, str12ns, str12sn, & + strp_tmp, strm_tmp, tmp_uvel_ee, tmp_vvel_se, tmp_vvel_ee, & + tmp_vvel_ne, tmp_uvel_ne, tmp_uvel_se, dxhy, dyhx, cxp, cyp, & + cxm, cym, tinyarea,tmparea + + character(len=*), parameter :: subname = '(stress_iter)' + + call icepack_query_parameters(puny_out=puny) + call icepack_warnings_flush(nu_diag) + if (icepack_warnings_aborted()) then + call abort_ice(error_message=subname, file=__FILE__, & + line=__LINE__) + end if -#if defined (_OPENMP) - !-- local variables - real(kind=dbl_kind) :: dlen +#ifdef _OPENACC + !$acc parallel & + !$acc present(ee, ne, se, strength, uvel, vvel, dxt, dyt, hte, & + !$acc htn, htem1, htnm1, str1, str2, str3, str4, str5, str6, & + !$acc str7, str8, stressp_1, stressp_2, stressp_3, stressp_4, & + !$acc stressm_1, stressm_2, stressm_3, stressm_4, stress12_1, & + !$acc stress12_2, stress12_3, stress12_4, skiptcell) + !$acc loop + do iw = 1, NA_len +#else + call domp_get_domain(lb, ub, il, iu) + do iw = il, iu #endif - character(len=*), parameter :: subname = '(domp_get_domain_rlu)' - !--------------------------------------- + if (skiptcell(iw)) cycle + + tmparea = dxt(iw) * dyt(iw) ! necessary to split calc of tinyarea. Otherwize not binary identical + tinyarea = puny * tmparea + dxhy = p5 * (hte(iw) - htem1(iw)) + dyhx = p5 * (htn(iw) - htnm1(iw)) + cxp = c1p5 * htn(iw) - p5 * htnm1(iw) + cyp = c1p5 * hte(iw) - p5 * htem1(iw) + cxm = -(c1p5 * htnm1(iw) - p5 * htn(iw)) + cym = -(c1p5 * htem1(iw) - p5 * hte(iw)) + + !-------------------------------------------------------------- + ! strain rates + ! NOTE: these are actually strain rates * area (m^2/s) + !-------------------------------------------------------------- + + tmp_uvel_ne = uvel(ne(iw)) + tmp_uvel_se = uvel(se(iw)) + tmp_uvel_ee = uvel(ee(iw)) + + tmp_vvel_ee = vvel(ee(iw)) + tmp_vvel_se = vvel(se(iw)) + tmp_vvel_ne = vvel(ne(iw)) + ! divergence = e_11 + e_22 + divune = cyp * uvel(iw) - dyt(iw) * tmp_uvel_ee & + + cxp * vvel(iw) - dxt(iw) * tmp_vvel_se + divunw = cym * tmp_uvel_ee + dyt(iw) * uvel(iw) & + + cxp * tmp_vvel_ee - dxt(iw) * tmp_vvel_ne + divusw = cym * tmp_uvel_ne + dyt(iw) * tmp_uvel_se & + + cxm * tmp_vvel_ne + dxt(iw) * tmp_vvel_ee + divuse = cyp * tmp_uvel_se - dyt(iw) * tmp_uvel_ne & + + cxm * tmp_vvel_se + dxt(iw) * vvel(iw) + + ! tension strain rate = e_11 - e_22 + tensionne = -cym * uvel(iw) - dyt(iw) * tmp_uvel_ee & + + cxm * vvel(iw) + dxt(iw) * tmp_vvel_se + tensionnw = -cyp * tmp_uvel_ee + dyt(iw) * uvel(iw) & + + cxm * tmp_vvel_ee + dxt(iw) * tmp_vvel_ne + tensionsw = -cyp * tmp_uvel_ne + dyt(iw) * tmp_uvel_se & + + cxp * tmp_vvel_ne - dxt(iw) * tmp_vvel_ee + tensionse = -cym * tmp_uvel_se - dyt(iw) * tmp_uvel_ne & + + cxp * tmp_vvel_se - dxt(iw) * vvel(iw) + + ! 
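! The uvel/vvel values of the ee/ne/se neighbours are gathered once
! into the tmp_* scalars above, so every indirect load happens
! exactly once per cell; the corner computations then operate on
! scalars, which helps both the OpenACC and the auto-vectorized
! OpenMP code paths.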
shearing strain rate = 2 * e_12 + shearne = -cym * vvel(iw) - dyt(iw) * tmp_vvel_ee & + - cxm * uvel(iw) - dxt(iw) * tmp_uvel_se + shearnw = -cyp * tmp_vvel_ee + dyt(iw) * vvel(iw) & + - cxm * tmp_uvel_ee - dxt(iw) * tmp_uvel_ne + shearsw = -cyp * tmp_vvel_ne + dyt(iw) * tmp_vvel_se & + - cxp * tmp_uvel_ne + dxt(iw) * tmp_uvel_ee + shearse = -cym * tmp_vvel_se - dyt(iw) * tmp_vvel_ne & + - cxp * tmp_uvel_se + dxt(iw) * uvel(iw) + + ! Delta (in the denominator of zeta and eta) + Deltane = sqrt(divune**2 + ecci * (tensionne**2 + shearne**2)) + Deltanw = sqrt(divunw**2 + ecci * (tensionnw**2 + shearnw**2)) + Deltasw = sqrt(divusw**2 + ecci * (tensionsw**2 + shearsw**2)) + Deltase = sqrt(divuse**2 + ecci * (tensionse**2 + shearse**2)) + + !-------------------------------------------------------------- + ! replacement pressure/Delta (kg/s) + ! save replacement pressure for principal stress calculation + !-------------------------------------------------------------- + + c0ne = strength(iw) / max(Deltane, tinyarea) + c0nw = strength(iw) / max(Deltanw, tinyarea) + c0sw = strength(iw) / max(Deltasw, tinyarea) + c0se = strength(iw) / max(Deltase, tinyarea) + + c1ne = c0ne * arlx1i + c1nw = c0nw * arlx1i + c1sw = c0sw * arlx1i + c1se = c0se * arlx1i + + c0ne = c1ne * ecci + c0nw = c1nw * ecci + c0sw = c1sw * ecci + c0se = c1se * ecci + + !-------------------------------------------------------------- + ! the stresses (kg/s^2) + ! (1) northeast, (2) northwest, (3) southwest, (4) southeast + !-------------------------------------------------------------- + + stressp_1(iw) = (stressp_1(iw) * (c1 - arlx1i * revp) & + + c1ne * (divune * (c1 + Ktens) - Deltane * (c1 - Ktens))) * denom1 + stressp_2(iw) = (stressp_2(iw) * (c1 - arlx1i * revp) & + + c1nw * (divunw * (c1 + Ktens) - Deltanw * (c1 - Ktens))) * denom1 + stressp_3(iw) = (stressp_3(iw) * (c1 - arlx1i * revp) & + + c1sw * (divusw * (c1 + Ktens) - Deltasw * (c1 - Ktens))) * denom1 + stressp_4(iw) = (stressp_4(iw) * (c1 - arlx1i * revp) & + + c1se * (divuse * (c1 + Ktens) - Deltase * (c1 - Ktens))) * denom1 + + stressm_1(iw) = (stressm_1(iw) * (c1 - arlx1i * revp) + c0ne * tensionne * (c1 + Ktens)) * denom1 + stressm_2(iw) = (stressm_2(iw) * (c1 - arlx1i * revp) + c0nw * tensionnw * (c1 + Ktens)) * denom1 + stressm_3(iw) = (stressm_3(iw) * (c1 - arlx1i * revp) + c0sw * tensionsw * (c1 + Ktens)) * denom1 + stressm_4(iw) = (stressm_4(iw) * (c1 - arlx1i * revp) + c0se * tensionse * (c1 + Ktens)) * denom1 + + stress12_1(iw) = (stress12_1(iw) * (c1 - arlx1i * revp) + c0ne * shearne * p5 * (c1 + Ktens)) * denom1 + stress12_2(iw) = (stress12_2(iw) * (c1 - arlx1i * revp) + c0nw * shearnw * p5 * (c1 + Ktens)) * denom1 + stress12_3(iw) = (stress12_3(iw) * (c1 - arlx1i * revp) + c0sw * shearsw * p5 * (c1 + Ktens)) * denom1 + stress12_4(iw) = (stress12_4(iw) * (c1 - arlx1i * revp) + c0se * shearse * p5 * (c1 + Ktens)) * denom1 + + !-------------------------------------------------------------- + ! combinations of the stresses for the momentum equation + ! 
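! All twelve stress components above share one relaxation form,
!
!   sigma_new = ( sigma_old*(1 - revp/alpha) + forcing/alpha ) * denom1
!
! where 1/alpha enters through arlx1i (folded into the c0../c1..
! coefficients) and revp, Ktens and denom1 are supplied by
! ice_dyn_shared.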
(kg/s^2) + !-------------------------------------------------------------- + + ssigpn = stressp_1(iw) + stressp_2(iw) + ssigps = stressp_3(iw) + stressp_4(iw) + ssigpe = stressp_1(iw) + stressp_4(iw) + ssigpw = stressp_2(iw) + stressp_3(iw) + ssigp1 = (stressp_1(iw) + stressp_3(iw)) * p055 + ssigp2 = (stressp_2(iw) + stressp_4(iw)) * p055 + + ssigmn = stressm_1(iw) + stressm_2(iw) + ssigms = stressm_3(iw) + stressm_4(iw) + ssigme = stressm_1(iw) + stressm_4(iw) + ssigmw = stressm_2(iw) + stressm_3(iw) + ssigm1 = (stressm_1(iw) + stressm_3(iw)) * p055 + ssigm2 = (stressm_2(iw) + stressm_4(iw)) * p055 + + ssig12n = stress12_1(iw) + stress12_2(iw) + ssig12s = stress12_3(iw) + stress12_4(iw) + ssig12e = stress12_1(iw) + stress12_4(iw) + ssig12w = stress12_2(iw) + stress12_3(iw) + ssig121 = (stress12_1(iw) + stress12_3(iw)) * p111 + ssig122 = (stress12_2(iw) + stress12_4(iw)) * p111 + + csigpne = p111 * stressp_1(iw) + ssigp2 + p027 * stressp_3(iw) + csigpnw = p111 * stressp_2(iw) + ssigp1 + p027 * stressp_4(iw) + csigpsw = p111 * stressp_3(iw) + ssigp2 + p027 * stressp_1(iw) + csigpse = p111 * stressp_4(iw) + ssigp1 + p027 * stressp_2(iw) + + csigmne = p111 * stressm_1(iw) + ssigm2 + p027 * stressm_3(iw) + csigmnw = p111 * stressm_2(iw) + ssigm1 + p027 * stressm_4(iw) + csigmsw = p111 * stressm_3(iw) + ssigm2 + p027 * stressm_1(iw) + csigmse = p111 * stressm_4(iw) + ssigm1 + p027 * stressm_2(iw) + + csig12ne = p222 * stress12_1(iw) + ssig122 + p055 * stress12_3(iw) + csig12nw = p222 * stress12_2(iw) + ssig121 + p055 * stress12_4(iw) + csig12sw = p222 * stress12_3(iw) + ssig122 + p055 * stress12_1(iw) + csig12se = p222 * stress12_4(iw) + ssig121 + p055 * stress12_2(iw) + + str12ew = p5 * dxt(iw) * (p333 * ssig12e + p166 * ssig12w) + str12we = p5 * dxt(iw) * (p333 * ssig12w + p166 * ssig12e) + str12ns = p5 * dyt(iw) * (p333 * ssig12n + p166 * ssig12s) + str12sn = p5 * dyt(iw) * (p333 * ssig12s + p166 * ssig12n) + + !-------------------------------------------------------------- + ! for dF/dx (u momentum) + !-------------------------------------------------------------- + + strp_tmp = p25 * dyt(iw) * (p333 * ssigpn + p166 * ssigps) + strm_tmp = p25 * dyt(iw) * (p333 * ssigmn + p166 * ssigms) + + ! northeast (i,j) + str1(iw) = -strp_tmp - strm_tmp - str12ew & + + dxhy * (-csigpne + csigmne) + dyhx * csig12ne + + ! northwest (i+1,j) + str2(iw) = strp_tmp + strm_tmp - str12we & + + dxhy * (-csigpnw + csigmnw) + dyhx * csig12nw + + strp_tmp = p25 * dyt(iw) * (p333 * ssigps + p166 * ssigpn) + strm_tmp = p25 * dyt(iw) * (p333 * ssigms + p166 * ssigmn) + + ! southeast (i,j+1) + str3(iw) = -strp_tmp - strm_tmp + str12ew & + + dxhy * (-csigpse + csigmse) + dyhx * csig12se + + ! southwest (i+1,j+1) + str4(iw) = strp_tmp + strm_tmp + str12we & + + dxhy * (-csigpsw + csigmsw) + dyhx * csig12sw + + !-------------------------------------------------------------- + ! for dF/dy (v momentum) + !-------------------------------------------------------------- + + strp_tmp = p25 * dxt(iw) * (p333 * ssigpe + p166 * ssigpw) + strm_tmp = p25 * dxt(iw) * (p333 * ssigme + p166 * ssigmw) + + ! northeast (i,j) + str5(iw) = -strp_tmp + strm_tmp - str12ns & + - dyhx * (csigpne + csigmne) + dxhy * csig12ne + + ! southeast (i,j+1) + str6(iw) = strp_tmp - strm_tmp - str12sn & + - dyhx * (csigpse + csigmse) + dxhy * csig12se + + strp_tmp = p25 * dxt(iw) * (p333 * ssigpw + p166 * ssigpe) + strm_tmp = p25 * dxt(iw) * (p333 * ssigmw + p166 * ssigme) + + ! 
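! str1..str8 hold the eight corner contributions to the divergence
! of the internal stress; stepu later gathers them through the
! nw/sw/sse index arrays to assemble strintx/strinty at each U
! point.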
northwest (i+1,j) + str7(iw) = -strp_tmp + strm_tmp + str12ns & + - dyhx * (csigpnw + csigmnw) + dxhy * csig12nw + + ! southwest (i+1,j+1) + str8(iw) = strp_tmp - strm_tmp + str12sn & + - dyhx * (csigpsw + csigmsw) + dxhy * csig12sw + + end do +#ifdef _OPENACC + !$acc end parallel +#endif - ! proper action in "null" cases: - if (upper <= 0 .or. upper < lower) then - d_lower = 0 - d_upper = -1 - return - endif + end subroutine stress_iter + +!======================================================================= + + subroutine stress_last(NA_len, ee, ne, se, lb, ub, uvel, vvel, dxt, & + dyt, hte, htn, htem1, htnm1, strength, stressp_1, stressp_2, & + stressp_3, stressp_4, stressm_1, stressm_2, stressm_3, & + stressm_4, stress12_1, stress12_2, stress12_3, stress12_4, str1, & + str2, str3, str4, str5, str6, str7, str8, skiptcell, tarear, divu, & + rdg_conv, rdg_shear, shear) + + use ice_kinds_mod + use ice_constants, only : p027, p055, p111, p166, p222, p25, & + p333, p5, c1p5, c1, c0 + use ice_dyn_shared, only : ecci, denom1, arlx1i, Ktens, revp + + implicit none + + integer(kind=int_kind), intent(in) :: NA_len, lb, ub + integer(kind=int_kind), dimension(:), intent(in), contiguous :: & + ee, ne, se + real(kind=dbl_kind), dimension(:), intent(in), contiguous :: & + strength, uvel, vvel, dxt, dyt, hte, htn, htem1, htnm1, tarear + logical(kind=log_kind), intent(in), dimension(:) :: skiptcell + real(kind=dbl_kind), dimension(:), intent(inout), contiguous :: & + stressp_1, stressp_2, stressp_3, stressp_4, stressm_1, & + stressm_2, stressm_3, stressm_4, stress12_1, stress12_2, & + stress12_3, stress12_4 + real(kind=dbl_kind), dimension(:), intent(out), contiguous :: & + str1, str2, str3, str4, str5, str6, str7, str8, divu, & + rdg_conv, rdg_shear, shear + + ! local variables + + integer(kind=int_kind) :: iw, il, iu + real(kind=dbl_kind) :: puny, divune, divunw, divuse, divusw, & + tensionne, tensionnw, tensionse, tensionsw, shearne, shearnw, & + shearse, shearsw, Deltane, Deltanw, Deltase, Deltasw, c0ne, & + c0nw, c0se, c0sw, c1ne, c1nw, c1se, c1sw, ssigpn, ssigps, & + ssigpe, ssigpw, ssigmn, ssigms, ssigme, ssigmw, ssig12n, & + ssig12s, ssig12e, ssig12w, ssigp1, ssigp2, ssigm1, ssigm2, & + ssig121, ssig122, csigpne, csigpnw, csigpse, csigpsw, & + csigmne, csigmnw, csigmse, csigmsw, csig12ne, csig12nw, & + csig12se, csig12sw, str12ew, str12we, str12ns, str12sn, & + strp_tmp, strm_tmp, tmp_uvel_ee, tmp_vvel_se, tmp_vvel_ee, & + tmp_vvel_ne, tmp_uvel_ne, tmp_uvel_se, dxhy, dyhx, cxp, cyp, & + cxm, cym, tinyarea, tmparea + + character(len=*), parameter :: subname = '(stress_last)' + + call icepack_query_parameters(puny_out=puny) + call icepack_warnings_flush(nu_diag) + if (icepack_warnings_aborted()) then + call abort_ice(error_message=subname, file=__FILE__, & + line=__LINE__) + end if - ! 
proper action in serial sections - d_lower = lower - d_upper = upper +#ifdef _OPENACC + !$acc parallel & + !$acc present(ee, ne, se, strength, uvel, vvel, dxt, dyt, hte, & + !$acc htn, htem1, htnm1, str1, str2, str3, str4, str5, str6, & + !$acc str7, str8, stressp_1, stressp_2, stressp_3, stressp_4, & + !$acc stressm_1, stressm_2, stressm_3, stressm_4, stress12_1, & + !$acc stress12_2, stress12_3, stress12_4, tarear, divu, & + !$acc rdg_conv, rdg_shear, shear, skiptcell) + !$acc loop + do iw = 1, NA_len +#else + call domp_get_domain(lb, ub, il, iu) + do iw = il, iu +#endif -#if defined (_OPENMP) - if (omp_in_parallel()) then - dlen = real(upper-lower+1, dbl_kind) - d_lower = lower + floor((rdomp_iam*dlen+p5)/rdomp_nt, JPIM) - d_upper = lower -1 + floor((rdomp_iam*dlen+dlen+p5)/rdomp_nt, JPIM) - endif + if (skiptcell(iw)) cycle + + tmparea = dxt(iw) * dyt(iw) ! necessary to split calc of tinyarea. Otherwize not binary identical + tinyarea = puny * tmparea + dxhy = p5 * (hte(iw) - htem1(iw)) + dyhx = p5 * (htn(iw) - htnm1(iw)) + cxp = c1p5 * htn(iw) - p5 * htnm1(iw) + cyp = c1p5 * hte(iw) - p5 * htem1(iw) + cxm = -(c1p5 * htnm1(iw) - p5 * htn(iw)) + cym = -(c1p5 * htem1(iw) - p5 * hte(iw)) + + !-------------------------------------------------------------- + ! strain rates + ! NOTE: these are actually strain rates * area (m^2/s) + !-------------------------------------------------------------- + + tmp_uvel_ne = uvel(ne(iw)) + tmp_uvel_se = uvel(se(iw)) + tmp_uvel_ee = uvel(ee(iw)) + + tmp_vvel_ee = vvel(ee(iw)) + tmp_vvel_se = vvel(se(iw)) + tmp_vvel_ne = vvel(ne(iw)) + + ! divergence = e_11 + e_22 + divune = cyp * uvel(iw) - dyt(iw) * tmp_uvel_ee & + + cxp * vvel(iw) - dxt(iw) * tmp_vvel_se + divunw = cym * tmp_uvel_ee + dyt(iw) * uvel(iw) & + + cxp * tmp_vvel_ee - dxt(iw) * tmp_vvel_ne + divusw = cym * tmp_uvel_ne + dyt(iw) * tmp_uvel_se & + + cxm * tmp_vvel_ne + dxt(iw) * tmp_vvel_ee + divuse = cyp * tmp_uvel_se - dyt(iw) * tmp_uvel_ne & + + cxm * tmp_vvel_se + dxt(iw) * vvel(iw) + + ! tension strain rate = e_11 - e_22 + tensionne = -cym * uvel(iw) - dyt(iw) * tmp_uvel_ee & + + cxm * vvel(iw) + dxt(iw) * tmp_vvel_se + tensionnw = -cyp * tmp_uvel_ee + dyt(iw) * uvel(iw) & + + cxm * tmp_vvel_ee + dxt(iw) * tmp_vvel_ne + tensionsw = -cyp * tmp_uvel_ne + dyt(iw) * tmp_uvel_se & + + cxp * tmp_vvel_ne - dxt(iw) * tmp_vvel_ee + tensionse = -cym * tmp_uvel_se - dyt(iw) * tmp_uvel_ne & + + cxp * tmp_vvel_se - dxt(iw) * vvel(iw) + + ! shearing strain rate = 2 * e_12 + shearne = -cym * vvel(iw) - dyt(iw) * tmp_vvel_ee & + - cxm * uvel(iw) - dxt(iw) * tmp_uvel_se + shearnw = -cyp * tmp_vvel_ee + dyt(iw) * vvel(iw) & + - cxm * tmp_uvel_ee - dxt(iw) * tmp_uvel_ne + shearsw = -cyp * tmp_vvel_ne + dyt(iw) * tmp_vvel_se & + - cxp * tmp_uvel_ne + dxt(iw) * tmp_uvel_ee + shearse = -cym * tmp_vvel_se - dyt(iw) * tmp_vvel_ne & + - cxp * tmp_uvel_se + dxt(iw) * uvel(iw) + + ! Delta (in the denominator of zeta and eta) + Deltane = sqrt(divune**2 + ecci * (tensionne**2 + shearne**2)) + Deltanw = sqrt(divunw**2 + ecci * (tensionnw**2 + shearnw**2)) + Deltasw = sqrt(divusw**2 + ecci * (tensionsw**2 + shearsw**2)) + Deltase = sqrt(divuse**2 + ecci * (tensionse**2 + shearse**2)) + + !-------------------------------------------------------------- + ! on last subcycle, save quantities for mechanical + ! redistribution + !-------------------------------------------------------------- + + divu(iw) = p25 * (divune + divunw + divuse + divusw) * tarear(iw) + rdg_conv(iw) = -min(divu(iw), c0) ! 
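! stress_last is the variant behind the evp1d_stress interface that
! runs on the final subcycle only: it repeats the stress_iter update
! and additionally stores the deformation diagnostics (divu,
! rdg_conv, rdg_shear, shear) needed for mechanical redistribution.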
TODO: Could move outside the entire kernel + rdg_shear(iw) = p5 * (p25 * (Deltane + Deltanw + Deltase + Deltasw) * tarear(iw) - abs(divu(iw))) + + ! diagnostic only + ! shear = sqrt(tension**2 + shearing**2) + shear(iw) = p25 * tarear(iw) * sqrt((tensionne + tensionnw + tensionse + tensionsw)**2 & + + (shearne + shearnw + shearse + shearsw)**2) + + !-------------------------------------------------------------- + ! replacement pressure/Delta (kg/s) + ! save replacement pressure for principal stress calculation + !-------------------------------------------------------------- + + c0ne = strength(iw) / max(Deltane, tinyarea) + c0nw = strength(iw) / max(Deltanw, tinyarea) + c0sw = strength(iw) / max(Deltasw, tinyarea) + c0se = strength(iw) / max(Deltase, tinyarea) + + c1ne = c0ne * arlx1i + c1nw = c0nw * arlx1i + c1sw = c0sw * arlx1i + c1se = c0se * arlx1i + + c0ne = c1ne * ecci + c0nw = c1nw * ecci + c0sw = c1sw * ecci + c0se = c1se * ecci + + !-------------------------------------------------------------- + ! the stresses (kg/s^2) + ! (1) northeast, (2) northwest, (3) southwest, (4) southeast + !-------------------------------------------------------------- + + stressp_1(iw) = (stressp_1(iw) * (c1 - arlx1i * revp) & + + c1ne * (divune * (c1 + Ktens) - Deltane * (c1 - Ktens))) * denom1 + stressp_2(iw) = (stressp_2(iw) * (c1 - arlx1i * revp) & + + c1nw * (divunw * (c1 + Ktens) - Deltanw * (c1 - Ktens))) * denom1 + stressp_3(iw) = (stressp_3(iw) * (c1 - arlx1i * revp) & + + c1sw * (divusw * (c1 + Ktens) - Deltasw * (c1 - Ktens))) * denom1 + stressp_4(iw) = (stressp_4(iw) * (c1 - arlx1i * revp) & + + c1se * (divuse * (c1 + Ktens) - Deltase * (c1 - Ktens))) * denom1 + + stressm_1(iw) = (stressm_1(iw) * (c1 - arlx1i * revp) + c0ne * tensionne * (c1 + Ktens)) * denom1 + stressm_2(iw) = (stressm_2(iw) * (c1 - arlx1i * revp) + c0nw * tensionnw * (c1 + Ktens)) * denom1 + stressm_3(iw) = (stressm_3(iw) * (c1 - arlx1i * revp) + c0sw * tensionsw * (c1 + Ktens)) * denom1 + stressm_4(iw) = (stressm_4(iw) * (c1 - arlx1i * revp) + c0se * tensionse * (c1 + Ktens)) * denom1 + + stress12_1(iw) = (stress12_1(iw) * (c1 - arlx1i * revp) + c0ne * shearne * p5 * (c1 + Ktens)) * denom1 + stress12_2(iw) = (stress12_2(iw) * (c1 - arlx1i * revp) + c0nw * shearnw * p5 * (c1 + Ktens)) * denom1 + stress12_3(iw) = (stress12_3(iw) * (c1 - arlx1i * revp) + c0sw * shearsw * p5 * (c1 + Ktens)) * denom1 + stress12_4(iw) = (stress12_4(iw) * (c1 - arlx1i * revp) + c0se * shearse * p5 * (c1 + Ktens)) * denom1 + + !-------------------------------------------------------------- + ! combinations of the stresses for the momentum equation + ! 
(kg/s^2) + !-------------------------------------------------------------- + + ssigpn = stressp_1(iw) + stressp_2(iw) + ssigps = stressp_3(iw) + stressp_4(iw) + ssigpe = stressp_1(iw) + stressp_4(iw) + ssigpw = stressp_2(iw) + stressp_3(iw) + ssigp1 = (stressp_1(iw) + stressp_3(iw)) * p055 + ssigp2 = (stressp_2(iw) + stressp_4(iw)) * p055 + + ssigmn = stressm_1(iw) + stressm_2(iw) + ssigms = stressm_3(iw) + stressm_4(iw) + ssigme = stressm_1(iw) + stressm_4(iw) + ssigmw = stressm_2(iw) + stressm_3(iw) + ssigm1 = (stressm_1(iw) + stressm_3(iw)) * p055 + ssigm2 = (stressm_2(iw) + stressm_4(iw)) * p055 + + ssig12n = stress12_1(iw) + stress12_2(iw) + ssig12s = stress12_3(iw) + stress12_4(iw) + ssig12e = stress12_1(iw) + stress12_4(iw) + ssig12w = stress12_2(iw) + stress12_3(iw) + ssig121 = (stress12_1(iw) + stress12_3(iw)) * p111 + ssig122 = (stress12_2(iw) + stress12_4(iw)) * p111 + + csigpne = p111 * stressp_1(iw) + ssigp2 + p027 * stressp_3(iw) + csigpnw = p111 * stressp_2(iw) + ssigp1 + p027 * stressp_4(iw) + csigpsw = p111 * stressp_3(iw) + ssigp2 + p027 * stressp_1(iw) + csigpse = p111 * stressp_4(iw) + ssigp1 + p027 * stressp_2(iw) + + csigmne = p111 * stressm_1(iw) + ssigm2 + p027 * stressm_3(iw) + csigmnw = p111 * stressm_2(iw) + ssigm1 + p027 * stressm_4(iw) + csigmsw = p111 * stressm_3(iw) + ssigm2 + p027 * stressm_1(iw) + csigmse = p111 * stressm_4(iw) + ssigm1 + p027 * stressm_2(iw) + + csig12ne = p222 * stress12_1(iw) + ssig122 + p055 * stress12_3(iw) + csig12nw = p222 * stress12_2(iw) + ssig121 + p055 * stress12_4(iw) + csig12sw = p222 * stress12_3(iw) + ssig122 + p055 * stress12_1(iw) + csig12se = p222 * stress12_4(iw) + ssig121 + p055 * stress12_2(iw) + + str12ew = p5 * dxt(iw) * (p333 * ssig12e + p166 * ssig12w) + str12we = p5 * dxt(iw) * (p333 * ssig12w + p166 * ssig12e) + str12ns = p5 * dyt(iw) * (p333 * ssig12n + p166 * ssig12s) + str12sn = p5 * dyt(iw) * (p333 * ssig12s + p166 * ssig12n) + + !-------------------------------------------------------------- + ! for dF/dx (u momentum) + !-------------------------------------------------------------- + + strp_tmp = p25 * dyt(iw) * (p333 * ssigpn + p166 * ssigps) + strm_tmp = p25 * dyt(iw) * (p333 * ssigmn + p166 * ssigms) + + ! northeast (i,j) + str1(iw) = -strp_tmp - strm_tmp - str12ew & + + dxhy * (-csigpne + csigmne) + dyhx * csig12ne + + ! northwest (i+1,j) + str2(iw) = strp_tmp + strm_tmp - str12we & + + dxhy * (-csigpnw + csigmnw) + dyhx * csig12nw + + strp_tmp = p25 * dyt(iw) * (p333 * ssigps + p166 * ssigpn) + strm_tmp = p25 * dyt(iw) * (p333 * ssigms + p166 * ssigmn) + + ! southeast (i,j+1) + str3(iw) = -strp_tmp - strm_tmp + str12ew & + + dxhy * (-csigpse + csigmse) + dyhx * csig12se + + ! southwest (i+1,j+1) + str4(iw) = strp_tmp + strm_tmp + str12we & + + dxhy * (-csigpsw + csigmsw) + dyhx * csig12sw + + !-------------------------------------------------------------- + ! for dF/dy (v momentum) + !-------------------------------------------------------------- + + strp_tmp = p25 * dxt(iw) * (p333 * ssigpe + p166 * ssigpw) + strm_tmp = p25 * dxt(iw) * (p333 * ssigme + p166 * ssigmw) + + ! northeast (i,j) + str5(iw) = -strp_tmp + strm_tmp - str12ns & + - dyhx * (csigpne + csigmne) + dxhy * csig12ne + + ! southeast (i,j+1) + str6(iw) = strp_tmp - strm_tmp - str12sn & + - dyhx * (csigpse + csigmse) + dxhy * csig12se + + strp_tmp = p25 * dxt(iw) * (p333 * ssigpw + p166 * ssigpe) + strm_tmp = p25 * dxt(iw) * (p333 * ssigmw + p166 * ssigme) + + ! 
northwest (i+1,j) + str7(iw) = -strp_tmp + strm_tmp + str12ns & + - dyhx * (csigpnw + csigmnw) + dxhy * csig12nw + + ! southwest (i+1,j+1) + str8(iw) = strp_tmp - strm_tmp + str12sn & + - dyhx * (csigpsw + csigmsw) + dxhy * csig12sw + + end do +#ifdef _OPENACC + !$acc end parallel #endif - if (.false.) then - write(nu_diag,'(2a,i3,a,2i10)') subname,' openMP thread ', domp_iam, & - ' handles range: ', d_lower, d_upper - endif + end subroutine stress_last - end subroutine domp_get_domain_rlu +!======================================================================= -!---------------------------------------------------------------------------- + subroutine stepu_iter(NA_len, rhow, lb, ub, Cw, aiu, uocn, vocn, & + forcex, forcey, umassdti, fm, uarear, Tbu, uvel_init, vvel_init, & + uvel, vvel, str1, str2, str3, str4, str5, str6, str7, str8, nw, & + sw, sse, skipucell) - subroutine domp_get_thread_no (tnum) + use ice_kinds_mod + use ice_constants, only : c0, c1 + use ice_dyn_shared, only : brlx, revp, u0, cosw, sinw - implicit none - integer(int_kind), intent(out) :: tnum - character(len=*), parameter :: subname = '(domp_get_thread_no)' + implicit none - tnum = domp_iam + 1 + integer(kind=int_kind), intent(in) :: NA_len, lb, ub + real(kind=dbl_kind), intent(in) :: rhow + logical(kind=log_kind), intent(in), dimension(:) :: skipucell + integer(kind=int_kind), dimension(:), intent(in), contiguous :: & + nw, sw, sse + real(kind=dbl_kind), dimension(:), intent(in), contiguous :: & + uvel_init, vvel_init, aiu, forcex, forcey, umassdti, Tbu, & + uocn, vocn, fm, uarear, Cw, str1, str2, str3, str4, str5, & + str6, str7, str8 + real(kind=dbl_kind), dimension(:), intent(inout), contiguous :: & + uvel, vvel - end subroutine domp_get_thread_no + ! local variables -!---------------------------------------------------------------------------- + integer(kind=int_kind) :: iw, il, iu + real(kind=dbl_kind) :: uold, vold, vrel, cca, ccb, ab2, cc1, & + cc2, taux, tauy, Cb, tmp_str2_nw, tmp_str3_sse, tmp_str4_sw, & + tmp_str6_sse, tmp_str7_nw, tmp_str8_sw, waterx, watery, & + tmp_strintx, tmp_strinty -!former end module dmi_omp + character(len=*), parameter :: subname = '(stepu_iter)' -!=============================================================================== +#ifdef _OPENACC + !$acc parallel & + !$acc present(Cw, aiu, uocn, vocn, forcex, forcey, umassdti, fm, & + !$acc uarear, Tbu, uvel_init, vvel_init, nw, sw, sse, skipucell, & + !$acc str1, str2, str3, str4, str5, str6, str7, str8, uvel, & + !$acc vvel) + !$acc loop + do iw = 1, NA_len +#else + call domp_get_domain(lb, ub, il, iu) + do iw = il, iu +#endif -!former module bench_v2 + if (skipucell(iw)) cycle -!---------------------------------------------------------------------------- + uold = uvel(iw) + vold = vvel(iw) - subroutine stress_i(NA_len, & - ee,ne,se,lb,ub,uvel,vvel,dxt,dyt, & - hte,htn,htem1,htnm1, & - strength,stressp_1,stressp_2,stressp_3,stressp_4, & - stressm_1,stressm_2,stressm_3,stressm_4,stress12_1, & - stress12_2,stress12_3,stress12_4,str1,str2,str3,str4,str5, & - str6,str7,str8) + vrel = aiu(iw) * rhow * Cw(iw) * sqrt((uocn(iw) - uold)**2 + (vocn(iw) - vold)**2) - use ice_kinds_mod - use ice_constants, only: p027, p055, p111, p166, p222, p25, p333, p5, c1p5, c1 - use ice_dyn_shared, only: ecci, denom1, arlx1i, Ktens, revp + waterx = uocn(iw) * cosw - vocn(iw) * sinw * sign(c1, fm(iw)) + watery = vocn(iw) * cosw + uocn(iw) * sinw * sign(c1, fm(iw)) - implicit none + taux = vrel * waterx + tauy = vrel * watery - integer (kind=int_kind), 
intent(in) :: NA_len - integer (kind=int_kind), intent(in) :: lb,ub - integer (kind=int_kind), dimension(:), intent(in), contiguous :: ee,ne,se - real (kind=dbl_kind), dimension(:), intent(in), contiguous :: & - strength, uvel, vvel, dxt, dyt, & - hte,htn,htem1,htnm1 - real (kind=dbl_kind), dimension(:), intent(inout), contiguous :: & - stressp_1,stressp_2, stressp_3, stressp_4, stressm_1, stressm_2, & - stressm_3,stressm_4, stress12_1,stress12_2,stress12_3, stress12_4 - real (kind=DBL_KIND), dimension(:), intent(out), contiguous :: & - str1,str2,str3,str4,str5,str6,str7,str8 + Cb = Tbu(iw) / (sqrt(uold**2 + vold**2) + u0) - !-- local variables + cca = (brlx + revp) * umassdti(iw) + vrel * cosw + Cb + ccb = fm(iw) + sign(c1, fm(iw)) * vrel * sinw - integer (kind=int_kind) :: iw,il,iu - real (kind=dbl_kind) :: & - puny - real (kind=DBL_KIND) :: & - divune, divunw, divuse, divusw,tensionne, tensionnw, tensionse, tensionsw, & - shearne, shearnw, shearse, shearsw, Deltane, Deltanw, Deltase, Deltasw , & - c0ne, c0nw, c0se, c0sw, c1ne, c1nw, c1se, c1sw , & - ssigpn, ssigps, ssigpe, ssigpw, ssigmn, ssigms, ssigme, ssigmw , & - ssig12n, ssig12s, ssig12e, ssig12w, ssigp1, ssigp2,ssigm1, ssigm2,ssig121, & - ssig122, csigpne, csigpnw, csigpse, csigpsw,csigmne, csigmnw, csigmse , & - csigmsw, csig12ne, csig12nw, csig12se, csig12sw, str12ew, str12we,str12ns, & - str12sn, strp_tmp, strm_tmp, tmp_uvel_ee, tmp_vvel_se, tmp_vvel_ee, & - tmp_vvel_ne, tmp_uvel_ne, tmp_uvel_se - real (kind=DBL_KIND) :: dxhy,dyhx,cxp,cyp,cxm,cym,tinyarea - - character(len=*), parameter :: subname = '(stress_i)' - !--------------------------------------- + ab2 = cca**2 + ccb**2 - call icepack_query_parameters(puny_out=puny) - call icepack_warnings_flush(nu_diag) - if (icepack_warnings_aborted()) call abort_ice(error_message=subname, & - file=__FILE__, line=__LINE__) + tmp_str2_nw = str2(nw(iw)) + tmp_str3_sse = str3(sse(iw)) + tmp_str4_sw = str4(sw(iw)) + tmp_str6_sse = str6(sse(iw)) + tmp_str7_nw = str7(nw(iw)) + tmp_str8_sw = str8(sw(iw)) -#ifdef _OPENACC - !$acc parallel & - !$acc present(ee,ne,se,strength,uvel,vvel,dxt,dyt, & - !$acc hte, htn, htem1, htnm1, & - !$acc str1,str2,str3,str4,str5,str6,str7,str8, & - !$acc stressp_1,stressp_2,stressp_3,stressp_4, & - !$acc stressm_1,stressm_2,stressm_3,stressm_4, & - !$acc stress12_1,stress12_2,stress12_3,stress12_4) - !$acc loop - do iw = 1,NA_len -#else - call domp_get_domain(lb,ub,il,iu) - do iw = il, iu -#endif - tinyarea = puny*dxt(iw)*dyt(iw) - dxhy = p5*(hte(iw) - htem1(iw)) - dyhx = p5*(htn(iw) - htnm1(iw)) - cxp = c1p5*htn(iw) - p5*htnm1(iw) - cyp = c1p5*hte(iw) - p5*htem1(iw) - cxm = -(c1p5*htnm1(iw) - p5*htn(iw)) - cym = -(c1p5*htem1(iw) - p5*hte(iw)) - - !----------------------------------------------------------------- - ! strain rates - ! NOTE these are actually strain rates * area (m^2/s) - !----------------------------------------------------------------- - ! divergence = e_11 + e_22 - tmp_uvel_ee = uvel(ee(iw)) - tmp_vvel_ee = vvel(ee(iw)) - - tmp_vvel_se = vvel(se(iw)) - tmp_uvel_se = uvel(se(iw)) - - ! ne - divune = cyp*uvel(iw) - dyt(iw)*tmp_uvel_ee & - + cxp*vvel(iw) - dxt(iw)*tmp_vvel_se - ! tension strain rate = e_11 - e_22 - tensionne = -cym*uvel(iw) - dyt(iw)*tmp_uvel_ee & - + cxm*vvel(iw) + dxt(iw)*tmp_vvel_se - ! shearing strain rate = 2*e_12 - shearne = -cym*vvel(iw) - dyt(iw)*tmp_vvel_ee & - - cxm*uvel(iw) - dxt(iw)*tmp_uvel_se - ! Delta (in the denominator of zeta, eta) - Deltane = sqrt(divune**2 + ecci*(tensionne**2 + shearne**2)) - - ! 
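! With cca, ccb, cc1 and cc2 as computed in stepu_iter above, each U
! point solves the 2x2 implicit system
!
!   | cca  -ccb | |u_new|   |cc1|
!   | ccb   cca | |v_new| = |cc2|
!
! by Cramer's rule with determinant ab2 = cca**2 + ccb**2; a minimal
! sketch of the resulting update (assumed form, the statements are
! not shown in this excerpt):
!
!   uvel(iw) = (cca*cc1 + ccb*cc2) / ab2
!   vvel(iw) = (cca*cc2 - ccb*cc1) / ab2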
These two can move after ne block - ! - tmp_uvel_ne = uvel(ne(iw)) - tmp_vvel_ne = vvel(ne(iw)) - - ! nw - divunw = cym*tmp_uvel_ee + dyt(iw)*uvel(iw) & - + cxp*tmp_vvel_ee - dxt(iw)*tmp_vvel_ne - tensionnw = -cyp*tmp_uvel_ee + dyt(iw)*uvel(iw) & - + cxm*tmp_vvel_ee + dxt(iw)*tmp_vvel_ne - shearnw = -cyp*tmp_vvel_ee + dyt(iw)*vvel(iw) & - - cxm*tmp_uvel_ee - dxt(iw)*tmp_uvel_ne - Deltanw = sqrt(divunw**2 + ecci*(tensionnw**2 + shearnw**2)) - - ! sw - divusw = cym*tmp_uvel_ne + dyt(iw)*tmp_uvel_se & - + cxm*tmp_vvel_ne + dxt(iw)*tmp_vvel_ee - tensionsw = -cyp*tmp_uvel_ne + dyt(iw)*tmp_uvel_se & - + cxp*tmp_vvel_ne - dxt(iw)*tmp_vvel_ee - shearsw = -cyp*tmp_vvel_ne + dyt(iw)*tmp_vvel_se & - - cxp*tmp_uvel_ne + dxt(iw)*tmp_uvel_ee - Deltasw = sqrt(divusw**2 + ecci*(tensionsw**2 + shearsw**2)) - - ! se - divuse = cyp*tmp_uvel_se - dyt(iw)*tmp_uvel_ne & - + cxm*tmp_vvel_se + dxt(iw)*vvel(iw) - tensionse = -cym*tmp_uvel_se - dyt(iw)*tmp_uvel_ne & - + cxp*tmp_vvel_se - dxt(iw)*vvel(iw) - shearse = -cym*tmp_vvel_se - dyt(iw)*tmp_vvel_ne & - - cxp*tmp_uvel_se + dxt(iw)*uvel(iw) - Deltase = sqrt(divuse**2 + ecci*(tensionse**2 + shearse**2)) - - !----------------------------------------------------------------- - ! replacement pressure/Delta ! kg/s - ! save replacement pressure for principal stress calculation - !----------------------------------------------------------------- - c0ne = strength(iw)/max(Deltane,tinyarea) - c0nw = strength(iw)/max(Deltanw,tinyarea) - c0sw = strength(iw)/max(Deltasw,tinyarea) - c0se = strength(iw)/max(Deltase,tinyarea) - - c1ne = c0ne*arlx1i - c1nw = c0nw*arlx1i - c1sw = c0sw*arlx1i - c1se = c0se*arlx1i - - c0ne = c1ne*ecci - c0nw = c1nw*ecci - c0sw = c1sw*ecci - c0se = c1se*ecci - - !----------------------------------------------------------------- - ! the stresses ! kg/s^2 - ! (1) northeast, (2) northwest, (3) southwest, (4) southeast - !----------------------------------------------------------------- - - stressp_1(iw) = (stressp_1(iw)*(c1-arlx1i*revp) + c1ne*(divune*(c1+Ktens) - Deltane*(c1-Ktens))) * denom1 - stressp_2(iw) = (stressp_2(iw)*(c1-arlx1i*revp) + c1nw*(divunw*(c1+Ktens) - Deltanw*(c1-Ktens))) * denom1 - stressp_3(iw) = (stressp_3(iw)*(c1-arlx1i*revp) + c1sw*(divusw*(c1+Ktens) - Deltasw*(c1-Ktens))) * denom1 - stressp_4(iw) = (stressp_4(iw)*(c1-arlx1i*revp) + c1se*(divuse*(c1+Ktens) - Deltase*(c1-Ktens))) * denom1 - - stressm_1(iw) = (stressm_1(iw)*(c1-arlx1i*revp) + c0ne*tensionne*(c1+Ktens)) * denom1 - stressm_2(iw) = (stressm_2(iw)*(c1-arlx1i*revp) + c0nw*tensionnw*(c1+Ktens)) * denom1 - stressm_3(iw) = (stressm_3(iw)*(c1-arlx1i*revp) + c0sw*tensionsw*(c1+Ktens)) * denom1 - stressm_4(iw) = (stressm_4(iw)*(c1-arlx1i*revp) + c0se*tensionse*(c1+Ktens)) * denom1 - - stress12_1(iw) = (stress12_1(iw)*(c1-arlx1i*revp) + c0ne*shearne*p5*(c1+Ktens)) * denom1 - stress12_2(iw) = (stress12_2(iw)*(c1-arlx1i*revp) + c0nw*shearnw*p5*(c1+Ktens)) * denom1 - stress12_3(iw) = (stress12_3(iw)*(c1-arlx1i*revp) + c0sw*shearsw*p5*(c1+Ktens)) * denom1 - stress12_4(iw) = (stress12_4(iw)*(c1-arlx1i*revp) + c0se*shearse*p5*(c1+Ktens)) * denom1 - - !----------------------------------------------------------------- - ! combinations of the stresses for the momentum equation ! 
kg/s^2 - !----------------------------------------------------------------- - - ssigpn = stressp_1(iw) + stressp_2(iw) - ssigps = stressp_3(iw) + stressp_4(iw) - ssigpe = stressp_1(iw) + stressp_4(iw) - ssigpw = stressp_2(iw) + stressp_3(iw) - ssigp1 =(stressp_1(iw) + stressp_3(iw))*p055 - ssigp2 =(stressp_2(iw) + stressp_4(iw))*p055 - - ssigmn = stressm_1(iw) + stressm_2(iw) - ssigms = stressm_3(iw) + stressm_4(iw) - ssigme = stressm_1(iw) + stressm_4(iw) - ssigmw = stressm_2(iw) + stressm_3(iw) - ssigm1 =(stressm_1(iw) + stressm_3(iw))*p055 - ssigm2 =(stressm_2(iw) + stressm_4(iw))*p055 - - ssig12n = stress12_1(iw) + stress12_2(iw) - ssig12s = stress12_3(iw) + stress12_4(iw) - ssig12e = stress12_1(iw) + stress12_4(iw) - ssig12w = stress12_2(iw) + stress12_3(iw) - ssig121 =(stress12_1(iw) + stress12_3(iw))*p111 - ssig122 =(stress12_2(iw) + stress12_4(iw))*p111 - - csigpne = p111*stressp_1(iw) + ssigp2 + p027*stressp_3(iw) - csigpnw = p111*stressp_2(iw) + ssigp1 + p027*stressp_4(iw) - csigpsw = p111*stressp_3(iw) + ssigp2 + p027*stressp_1(iw) - csigpse = p111*stressp_4(iw) + ssigp1 + p027*stressp_2(iw) - - csigmne = p111*stressm_1(iw) + ssigm2 + p027*stressm_3(iw) - csigmnw = p111*stressm_2(iw) + ssigm1 + p027*stressm_4(iw) - csigmsw = p111*stressm_3(iw) + ssigm2 + p027*stressm_1(iw) - csigmse = p111*stressm_4(iw) + ssigm1 + p027*stressm_2(iw) - - csig12ne = p222*stress12_1(iw) + ssig122 + p055*stress12_3(iw) - csig12nw = p222*stress12_2(iw) + ssig121 + p055*stress12_4(iw) - csig12sw = p222*stress12_3(iw) + ssig122 + p055*stress12_1(iw) - csig12se = p222*stress12_4(iw) + ssig121 + p055*stress12_2(iw) - - str12ew = p5*dxt(iw)*(p333*ssig12e + p166*ssig12w) - str12we = p5*dxt(iw)*(p333*ssig12w + p166*ssig12e) - str12ns = p5*dyt(iw)*(p333*ssig12n + p166*ssig12s) - str12sn = p5*dyt(iw)*(p333*ssig12s + p166*ssig12n) - - !----------------------------------------------------------------- - ! for dF/dx (u momentum) - !----------------------------------------------------------------- - strp_tmp = p25*dyt(iw)*(p333*ssigpn + p166*ssigps) - strm_tmp = p25*dyt(iw)*(p333*ssigmn + p166*ssigms) - - ! northeast (iw) - str1(iw) = -strp_tmp - strm_tmp - str12ew & - + dxhy*(-csigpne + csigmne) + dyhx*csig12ne - - ! northwest (i+1,j) - str2(iw) = strp_tmp + strm_tmp - str12we & - + dxhy*(-csigpnw + csigmnw) + dyhx*csig12nw - - strp_tmp = p25*dyt(iw)*(p333*ssigps + p166*ssigpn) - strm_tmp = p25*dyt(iw)*(p333*ssigms + p166*ssigmn) - - ! southeast (i,j+1) - str3(iw) = -strp_tmp - strm_tmp + str12ew & - + dxhy*(-csigpse + csigmse) + dyhx*csig12se - - ! southwest (i+1,j+1) - str4(iw) = strp_tmp + strm_tmp + str12we & - + dxhy*(-csigpsw + csigmsw) + dyhx*csig12sw - - !----------------------------------------------------------------- - ! for dF/dy (v momentum) - !----------------------------------------------------------------- - strp_tmp = p25*dxt(iw)*(p333*ssigpe + p166*ssigpw) - strm_tmp = p25*dxt(iw)*(p333*ssigme + p166*ssigmw) - - ! northeast (i,j) - str5(iw) = -strp_tmp + strm_tmp - str12ns & - - dyhx*(csigpne + csigmne) + dxhy*csig12ne - - ! southeast (i,j+1) - str6(iw) = strp_tmp - strm_tmp - str12sn & - - dyhx*(csigpse + csigmse) + dxhy*csig12se - - strp_tmp = p25*dxt(iw)*(p333*ssigpw + p166*ssigpe) - strm_tmp = p25*dxt(iw)*(p333*ssigmw + p166*ssigme) - - ! northwest (i+1,j) - str7(iw) = -strp_tmp + strm_tmp + str12ns & - - dyhx*(csigpnw + csigmnw) + dxhy*csig12nw - - ! 
southwest (i+1,j+1) - str8(iw) = strp_tmp - strm_tmp + str12sn & - - dyhx*(csigpsw + csigmsw) + dxhy*csig12sw - enddo -#ifdef _OPENACC - !$acc end parallel -#endif + tmp_strintx = uarear(iw) * (str1(iw) + tmp_str2_nw + tmp_str3_sse + tmp_str4_sw) + tmp_strinty = uarear(iw) * (str5(iw) + tmp_str6_sse + tmp_str7_nw + tmp_str8_sw) - end subroutine stress_i - -!---------------------------------------------------------------------------- - - subroutine stress_l(NA_len, tarear, & - ee,ne,se,lb,ub,uvel,vvel,dxt,dyt, & - hte,htn,htem1,htnm1, & - strength,stressp_1,stressp_2,stressp_3,stressp_4, & - stressm_1,stressm_2,stressm_3,stressm_4,stress12_1, & - stress12_2,stress12_3,stress12_4, & - divu,rdg_conv,rdg_shear,shear, & - str1,str2,str3,str4,str5,str6,str7,str8 ) - - use ice_kinds_mod - use ice_constants, only: p027, p055, p111, p166, p222, p25, p333, p5, c1p5, c0, c1 - use ice_dyn_shared, only: ecci, denom1, arlx1i, Ktens, revp - - implicit none - - integer (kind=int_kind), intent(in) :: NA_len - integer (kind=int_kind), intent(in) :: lb,ub - integer (kind=int_kind), dimension(:), intent(in), contiguous :: ee,ne,se - real (kind=dbl_kind), dimension(:), intent(in), contiguous :: & - strength, uvel, vvel, dxt, dyt, tarear, & - hte,htn,htem1,htnm1 - real (kind=dbl_kind), dimension(:), intent(inout), contiguous :: & - stressp_1,stressp_2, stressp_3, stressp_4, stressm_1, stressm_2, & - stressm_3,stressm_4, stress12_1,stress12_2,stress12_3, stress12_4 - real (kind=DBL_KIND), dimension(:), intent(out), contiguous :: & - str1,str2,str3,str4,str5,str6,str7,str8 - real (kind=dbl_kind), dimension(:), intent(out), contiguous :: & - divu,rdg_conv,rdg_shear,shear - - !-- local variables - - integer (kind=int_kind) :: iw,il,iu - real (kind=dbl_kind) :: & - puny - real (kind=DBL_KIND) :: & - divune, divunw, divuse, divusw,tensionne, tensionnw, tensionse, tensionsw, & - shearne, shearnw, shearse, shearsw, Deltane, Deltanw, Deltase, Deltasw , & - c0ne, c0nw, c0se, c0sw, c1ne, c1nw, c1se, c1sw , & - ssigpn, ssigps, ssigpe, ssigpw, ssigmn, ssigms, ssigme, ssigmw , & - ssig12n, ssig12s, ssig12e, ssig12w, ssigp1, ssigp2,ssigm1, ssigm2,ssig121, & - ssig122, csigpne, csigpnw, csigpse, csigpsw,csigmne, csigmnw, csigmse , & - csigmsw, csig12ne, csig12nw, csig12se, csig12sw, str12ew, str12we,str12ns, & - str12sn, strp_tmp, strm_tmp, tmp_uvel_ee, tmp_vvel_se, tmp_vvel_ee, & - tmp_vvel_ne, tmp_uvel_ne, tmp_uvel_se - real (kind=DBL_KIND) :: dxhy,dyhx,cxp,cyp,cxm,cym,tinyarea - - character(len=*), parameter :: subname = '(stress_l)' - !--------------------------------------- - - call icepack_query_parameters(puny_out=puny) - call icepack_warnings_flush(nu_diag) - if (icepack_warnings_aborted()) call abort_ice(error_message=subname, & - file=__FILE__, line=__LINE__) + cc1 = tmp_strintx + forcex(iw) + taux & + + umassdti(iw) * (brlx * uold + revp * uvel_init(iw)) + cc2 = tmp_strinty + forcey(iw) + tauy & + + umassdti(iw) * (brlx * vold + revp * vvel_init(iw)) -#ifdef _OPENACC - !$acc parallel & - !$acc present(ee,ne,se,strength,uvel,vvel,dxt,dyt,tarear, & - !$acc hte,htn,htem1,htnm1, & - !$acc str1,str2,str3,str4,str5,str6,str7,str8, & - !$acc stressp_1,stressp_2,stressp_3,stressp_4, & - !$acc stressm_1,stressm_2,stressm_3,stressm_4, & - !$acc stress12_1,stress12_2,stress12_3,stress12_4, & - !$acc divu,rdg_conv,rdg_shear,shear) - !$acc loop - do iw = 1,NA_len -#else - call domp_get_domain(lb,ub,il,iu) - do iw = il, iu -#endif - tinyarea = puny*dxt(iw)*dyt(iw) - dxhy = p5*(hte(iw) - htem1(iw)) - dyhx = p5*(htn(iw) - 
htnm1(iw)) - cxp = c1p5*htn(iw) - p5*htnm1(iw) - cyp = c1p5*hte(iw) - p5*htem1(iw) - cxm = -(c1p5*htnm1(iw) - p5*htn(iw)) - cym = -(c1p5*htem1(iw) - p5*hte(iw)) - - !----------------------------------------------------------------- - ! strain rates - ! NOTE these are actually strain rates * area (m^2/s) - !----------------------------------------------------------------- - ! divergence = e_11 + e_22 - tmp_uvel_ee = uvel(ee(iw)) - tmp_vvel_se = vvel(se(iw)) - tmp_vvel_ee = vvel(ee(iw)) - tmp_vvel_ne = vvel(ne(iw)) - tmp_uvel_ne = uvel(ne(iw)) - tmp_uvel_se = uvel(se(iw)) - - divune = cyp*uvel(iw) - dyt(iw)*tmp_uvel_ee & - + cxp*vvel(iw) - dxt(iw)*tmp_vvel_se - divunw = cym*tmp_uvel_ee + dyt(iw)*uvel(iw) & - + cxp*tmp_vvel_ee - dxt(iw)*tmp_vvel_ne - divusw = cym*tmp_uvel_ne + dyt(iw)*tmp_uvel_se & - + cxm*tmp_vvel_ne + dxt(iw)*tmp_vvel_ee - divuse = cyp*tmp_uvel_se - dyt(iw)*tmp_uvel_ne & - + cxm*tmp_vvel_se + dxt(iw)*vvel(iw) - - ! tension strain rate = e_11 - e_22 - tensionne = -cym*uvel(iw) - dyt(iw)*tmp_uvel_ee & - + cxm*vvel(iw) + dxt(iw)*tmp_vvel_se - tensionnw = -cyp*tmp_uvel_ee + dyt(iw)*uvel(iw) & - + cxm*tmp_vvel_ee + dxt(iw)*tmp_vvel_ne - tensionsw = -cyp*tmp_uvel_ne + dyt(iw)*tmp_uvel_se & - + cxp*tmp_vvel_ne - dxt(iw)*tmp_vvel_ee - tensionse = -cym*tmp_uvel_se - dyt(iw)*tmp_uvel_ne & - + cxp*tmp_vvel_se - dxt(iw)*vvel(iw) - - ! shearing strain rate = 2*e_12 - shearne = -cym*vvel(iw) - dyt(iw)*tmp_vvel_ee & - - cxm*uvel(iw) - dxt(iw)*tmp_uvel_se - shearnw = -cyp*tmp_vvel_ee + dyt(iw)*vvel(iw) & - - cxm*tmp_uvel_ee - dxt(iw)*tmp_uvel_ne - shearsw = -cyp*tmp_vvel_ne + dyt(iw)*tmp_vvel_se & - - cxp*tmp_uvel_ne + dxt(iw)*tmp_uvel_ee - shearse = -cym*tmp_vvel_se - dyt(iw)*tmp_vvel_ne & - - cxp*tmp_uvel_se + dxt(iw)*uvel(iw) - - ! Delta (in the denominator of zeta, eta) - Deltane = sqrt(divune**2 + ecci*(tensionne**2 + shearne**2)) - Deltanw = sqrt(divunw**2 + ecci*(tensionnw**2 + shearnw**2)) - Deltase = sqrt(divuse**2 + ecci*(tensionse**2 + shearse**2)) - Deltasw = sqrt(divusw**2 + ecci*(tensionsw**2 + shearsw**2)) - - !----------------------------------------------------------------- - ! on last subcycle, save quantities for mechanical redistribution - !----------------------------------------------------------------- - divu(iw) = p25*(divune + divunw + divuse + divusw) * tarear(iw) - rdg_conv(iw) = -min(divu(iw),c0) ! Could move outside the entire "kernel" - rdg_shear(iw) = p5*( p25*(Deltane + Deltanw + Deltase + Deltasw) * tarear(iw) -abs(divu(iw)) ) - - ! diagnostic only - ! shear = sqrt(tension**2 + shearing**2) - shear(iw) = p25*tarear(iw)*sqrt( & - (tensionne + tensionnw + tensionse + tensionsw)**2 & - + (shearne + shearnw + shearse + shearsw)**2) - - !----------------------------------------------------------------- - ! replacement pressure/Delta ! kg/s - ! save replacement pressure for principal stress calculation - !----------------------------------------------------------------- - c0ne = strength(iw)/max(Deltane,tinyarea) - c0nw = strength(iw)/max(Deltanw,tinyarea) - c0sw = strength(iw)/max(Deltasw,tinyarea) - c0se = strength(iw)/max(Deltase,tinyarea) - - c1ne = c0ne*arlx1i - c1nw = c0nw*arlx1i - c1sw = c0sw*arlx1i - c1se = c0se*arlx1i - - c0ne = c1ne*ecci - c0nw = c1nw*ecci - c0sw = c1sw*ecci - c0se = c1se*ecci - - !----------------------------------------------------------------- - ! the stresses ! kg/s^2 - ! 
(1) northeast, (2) northwest, (3) southwest, (4) southeast - !----------------------------------------------------------------- - - stressp_1(iw) = (stressp_1(iw)*(c1-arlx1i*revp) + c1ne*(divune*(c1+Ktens) - Deltane*(c1-Ktens))) * denom1 - stressp_2(iw) = (stressp_2(iw)*(c1-arlx1i*revp) + c1nw*(divunw*(c1+Ktens) - Deltanw*(c1-Ktens))) * denom1 - stressp_3(iw) = (stressp_3(iw)*(c1-arlx1i*revp) + c1sw*(divusw*(c1+Ktens) - Deltasw*(c1-Ktens))) * denom1 - stressp_4(iw) = (stressp_4(iw)*(c1-arlx1i*revp) + c1se*(divuse*(c1+Ktens) - Deltase*(c1-Ktens))) * denom1 - - stressm_1(iw) = (stressm_1(iw)*(c1-arlx1i*revp) + c0ne*tensionne*(c1+Ktens)) * denom1 - stressm_2(iw) = (stressm_2(iw)*(c1-arlx1i*revp) + c0nw*tensionnw*(c1+Ktens)) * denom1 - stressm_3(iw) = (stressm_3(iw)*(c1-arlx1i*revp) + c0sw*tensionsw*(c1+Ktens)) * denom1 - stressm_4(iw) = (stressm_4(iw)*(c1-arlx1i*revp) + c0se*tensionse*(c1+Ktens)) * denom1 - - stress12_1(iw) = (stress12_1(iw)*(c1-arlx1i*revp) + c0ne*shearne*p5*(c1+Ktens)) * denom1 - stress12_2(iw) = (stress12_2(iw)*(c1-arlx1i*revp) + c0nw*shearnw*p5*(c1+Ktens)) * denom1 - stress12_3(iw) = (stress12_3(iw)*(c1-arlx1i*revp) + c0sw*shearsw*p5*(c1+Ktens)) * denom1 - stress12_4(iw) = (stress12_4(iw)*(c1-arlx1i*revp) + c0se*shearse*p5*(c1+Ktens)) * denom1 - - !----------------------------------------------------------------- - ! combinations of the stresses for the momentum equation ! kg/s^2 - !----------------------------------------------------------------- - - ssigpn = stressp_1(iw) + stressp_2(iw) - ssigps = stressp_3(iw) + stressp_4(iw) - ssigpe = stressp_1(iw) + stressp_4(iw) - ssigpw = stressp_2(iw) + stressp_3(iw) - ssigp1 =(stressp_1(iw) + stressp_3(iw))*p055 - ssigp2 =(stressp_2(iw) + stressp_4(iw))*p055 - - ssigmn = stressm_1(iw) + stressm_2(iw) - ssigms = stressm_3(iw) + stressm_4(iw) - ssigme = stressm_1(iw) + stressm_4(iw) - ssigmw = stressm_2(iw) + stressm_3(iw) - ssigm1 =(stressm_1(iw) + stressm_3(iw))*p055 - ssigm2 =(stressm_2(iw) + stressm_4(iw))*p055 - - ssig12n = stress12_1(iw) + stress12_2(iw) - ssig12s = stress12_3(iw) + stress12_4(iw) - ssig12e = stress12_1(iw) + stress12_4(iw) - ssig12w = stress12_2(iw) + stress12_3(iw) - ssig121 =(stress12_1(iw) + stress12_3(iw))*p111 - ssig122 =(stress12_2(iw) + stress12_4(iw))*p111 - - csigpne = p111*stressp_1(iw) + ssigp2 + p027*stressp_3(iw) - csigpnw = p111*stressp_2(iw) + ssigp1 + p027*stressp_4(iw) - csigpsw = p111*stressp_3(iw) + ssigp2 + p027*stressp_1(iw) - csigpse = p111*stressp_4(iw) + ssigp1 + p027*stressp_2(iw) - - csigmne = p111*stressm_1(iw) + ssigm2 + p027*stressm_3(iw) - csigmnw = p111*stressm_2(iw) + ssigm1 + p027*stressm_4(iw) - csigmsw = p111*stressm_3(iw) + ssigm2 + p027*stressm_1(iw) - csigmse = p111*stressm_4(iw) + ssigm1 + p027*stressm_2(iw) - - csig12ne = p222*stress12_1(iw) + ssig122 + p055*stress12_3(iw) - csig12nw = p222*stress12_2(iw) + ssig121 + p055*stress12_4(iw) - csig12sw = p222*stress12_3(iw) + ssig122 + p055*stress12_1(iw) - csig12se = p222*stress12_4(iw) + ssig121 + p055*stress12_2(iw) - - str12ew = p5*dxt(iw)*(p333*ssig12e + p166*ssig12w) - str12we = p5*dxt(iw)*(p333*ssig12w + p166*ssig12e) - str12ns = p5*dyt(iw)*(p333*ssig12n + p166*ssig12s) - str12sn = p5*dyt(iw)*(p333*ssig12s + p166*ssig12n) - - !----------------------------------------------------------------- - ! for dF/dx (u momentum) - !----------------------------------------------------------------- - strp_tmp = p25*dyt(iw)*(p333*ssigpn + p166*ssigps) - strm_tmp = p25*dyt(iw)*(p333*ssigmn + p166*ssigms) - - ! 
northeast (iw) - str1(iw) = -strp_tmp - strm_tmp - str12ew & - + dxhy*(-csigpne + csigmne) + dyhx*csig12ne - - ! northwest (i+1,j) - str2(iw) = strp_tmp + strm_tmp - str12we & - + dxhy*(-csigpnw + csigmnw) + dyhx*csig12nw - - strp_tmp = p25*dyt(iw)*(p333*ssigps + p166*ssigpn) - strm_tmp = p25*dyt(iw)*(p333*ssigms + p166*ssigmn) - - ! southeast (i,j+1) - str3(iw) = -strp_tmp - strm_tmp + str12ew & - + dxhy*(-csigpse + csigmse) + dyhx*csig12se - - ! southwest (i+1,j+1) - str4(iw) = strp_tmp + strm_tmp + str12we & - + dxhy*(-csigpsw + csigmsw) + dyhx*csig12sw - - !----------------------------------------------------------------- - ! for dF/dy (v momentum) - !----------------------------------------------------------------- - strp_tmp = p25*dxt(iw)*(p333*ssigpe + p166*ssigpw) - strm_tmp = p25*dxt(iw)*(p333*ssigme + p166*ssigmw) - - ! northeast (i,j) - str5(iw) = -strp_tmp + strm_tmp - str12ns & - - dyhx*(csigpne + csigmne) + dxhy*csig12ne - - ! southeast (i,j+1) - str6(iw) = strp_tmp - strm_tmp - str12sn & - - dyhx*(csigpse + csigmse) + dxhy*csig12se - - strp_tmp = p25*dxt(iw)*(p333*ssigpw + p166*ssigpe) - strm_tmp = p25*dxt(iw)*(p333*ssigmw + p166*ssigme) - - ! northwest (i+1,j) - str7(iw) = -strp_tmp + strm_tmp + str12ns & - - dyhx*(csigpnw + csigmnw) + dxhy*csig12nw - - ! southwest (i+1,j+1) - str8(iw) = strp_tmp - strm_tmp + str12sn & - - dyhx*(csigpsw + csigmsw) + dxhy*csig12sw - enddo -#ifdef _OPENACC - !$acc end parallel -#endif - end subroutine stress_l - -!---------------------------------------------------------------------------- - - subroutine stepu_iter(NA_len,rhow, & - lb,ub,Cw,aiu,uocn,vocn,forcex,forcey,umassdti,fm,uarear,Tbu, & - uvel_init,vvel_init,uvel,vvel, & - str1,str2,str3,str4,str5,str6,str7,str8, nw,sw,se,skipme) - - use ice_kinds_mod - use ice_dyn_shared, only: brlx, revp - use ice_constants, only: c0, c1 - - implicit none - - integer (kind=int_kind), intent(in) :: NA_len - real (kind=dbl_kind), intent(in) :: rhow - integer(kind=int_kind),intent(in) :: lb,ub - logical(kind=log_kind),intent(in), dimension(:) :: skipme - integer(kind=int_kind),dimension(:), intent(in), contiguous :: nw,sw,se - real(kind=dbl_kind),dimension(:), intent(in), contiguous :: & - uvel_init, vvel_init, aiu, forcex, forcey, umassdti, Tbu, & - uocn, vocn, fm, uarear,Cw - real(kind=DBL_KIND),dimension(:), intent(in), contiguous :: & - str1,str2,str3,str4,str5,str6,str7,str8 - real(kind=dbl_kind),dimension(:), intent(inout), contiguous :: & - uvel,vvel - real (kind=dbl_kind), parameter :: & - cosw = c1 , & ! cos(ocean turning angle) ! turning angle = 0 - sinw = c0 - - !-- local variables - - integer (kind=int_kind) :: iw,il,iu - real (kind=dbl_kind) :: uold, vold, vrel,cca,ccb,ab2,cc1,cc2,taux,tauy,Cb - real (kind=dbl_kind) :: tmp_str2_nw,tmp_str3_se,tmp_str4_sw, tmp_strintx - real (kind=dbl_kind) :: tmp_str6_se,tmp_str7_nw,tmp_str8_sw, tmp_strinty - real (kind=dbl_kind) :: waterx,watery - real (kind=dbl_kind) :: u0 = 5.e-5_dbl_kind ! 
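+ ! The velocity update below is the closed-form solution of the
+ ! 2x2 implicit system that each subcycling step reduces to at a
+ ! U point (the same algebra as the standard 2-D stepu solver):
+ !
+ !   cca*u - ccb*v = cc1
+ !   ccb*u + cca*v = cc2
+ !
+ ! solved by Cramer's rule with determinant ab2 = cca**2 + ccb**2,
+ ! giving u = (cca*cc1 + ccb*cc2)/ab2 and
+ !        v = (cca*cc2 - ccb*cc1)/ab2.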
residual velocity for seabed stress (m/s) - - character(len=*), parameter :: subname = '(stepu_iter)' - !--------------------------------------- + uvel(iw) = (cca * cc1 + ccb * cc2) / ab2 + vvel(iw) = (cca * cc2 - ccb * cc1) / ab2 + end do #ifdef _OPENACC - !$acc parallel & - !$acc present(Cw,aiu,uocn,vocn,forcex,forcey,umassdti,fm,uarear,Tbu, & - !$acc uvel_init,vvel_init,nw,sw,se,skipme, & - !$acc str1,str2,str3,str4,str5,str6,str7,str8,uvel,vvel) - !$acc loop - do iw = 1,NA_len -#else - call domp_get_domain(lb,ub,il,iu) - do iw = il, iu -#endif - if (skipme(iw)) cycle - uold = uvel(iw) - vold = vvel(iw) - vrel = aiu(iw)*rhow*Cw(iw)*sqrt((uocn(iw)-uold)**2+(vocn(iw)-vold)**2) - waterx = uocn(iw)*cosw - vocn(iw)*sinw*sign(c1,fm(iw)) - watery = vocn(iw)*cosw + uocn(iw)*sinw*sign(c1,fm(iw)) - taux = vrel*waterx - tauy = vrel*watery - Cb = Tbu(iw) / (sqrt(uold**2 + vold**2) + u0) - cca = (brlx + revp)*umassdti(iw) + vrel * cosw + Cb - ccb = fm(iw) + sign(c1,fm(iw)) * vrel * sinw - ab2 = cca**2 + ccb**2 - ! southeast(i,j+1) = se - ! northwest(i+1,j) = nw - ! southwest(i+1,j+1) = sw - tmp_str2_nw = str2(nw(iw)) - tmp_str3_se = str3(se(iw)) - tmp_str4_sw = str4(sw(iw)) - tmp_str6_se = str6(se(iw)) - tmp_str7_nw = str7(nw(iw)) - tmp_str8_sw = str8(sw(iw)) - - tmp_strintx = uarear(iw)*(str1(iw)+tmp_str2_nw+tmp_str3_se+tmp_str4_sw) - tmp_strinty = uarear(iw)*(str5(iw)+tmp_str6_se+tmp_str7_nw+tmp_str8_sw) - cc1 = tmp_strintx + forcex(iw) + taux & - + umassdti(iw)*(brlx*uold + revp*uvel_init(iw)) - cc2 = tmp_strinty + forcey(iw) + tauy & - + umassdti(iw)*(brlx*vold + revp*vvel_init(iw)) - uvel(iw) = (cca*cc1 + ccb*cc2) / ab2 - vvel(iw) = (cca*cc2 - ccb*cc1) / ab2 - enddo -#ifdef _OPENACC - !$acc end parallel + !$acc end parallel #endif - end subroutine stepu_iter - -!---------------------------------------------------------------------------- - - subroutine stepu_last(NA_len, rhow, & - lb,ub,Cw,aiu,uocn,vocn,forcex,forcey,umassdti,fm,uarear,Tbu, & - strintx,strinty,taubx,tauby, & - uvel_init,vvel_init,uvel,vvel, & - str1,str2,str3,str4,str5,str6,str7,str8, nw,sw,se,skipme) - - use ice_kinds_mod - use ice_constants, only: c0, c1 - use ice_dyn_shared, only: brlx, revp, seabed_stress - - implicit none - - integer (kind=int_kind), intent(in) :: NA_len - real (kind=dbl_kind), intent(in) :: rhow - logical(kind=log_kind),intent(in), dimension(:) :: skipme - integer(kind=int_kind),intent(in) :: lb,ub - integer(kind=int_kind),dimension(:), intent(in), contiguous :: nw,sw,se - real(kind=dbl_kind),dimension(:), intent(in), contiguous :: & - uvel_init, vvel_init, aiu, forcex, forcey, umassdti, Tbu, & - uocn, vocn, fm, uarear,Cw - real(kind=DBL_KIND),dimension(:), intent(in), contiguous :: & - str1,str2,str3,str4,str5,str6,str7,str8 - real(kind=dbl_kind),dimension(:), intent(inout), contiguous :: & - uvel,vvel, strintx,strinty, taubx,tauby - real (kind=dbl_kind), parameter :: & - cosw = c1 , & ! cos(ocean turning angle) ! turning angle = 0 - sinw = c0 - - !-- local variables - - integer (kind=int_kind) :: iw,il,iu - real (kind=dbl_kind) :: uold, vold, vrel,cca,ccb,ab2,cc1,cc2,taux,tauy,Cb - real (kind=dbl_kind) :: tmp_str2_nw,tmp_str3_se,tmp_str4_sw - real (kind=dbl_kind) :: tmp_str6_se,tmp_str7_nw,tmp_str8_sw - real (kind=dbl_kind) :: waterx,watery - real (kind=dbl_kind) :: u0 = 5.e-5_dbl_kind ! 
residual velocity for seabed stress (m/s) - - character(len=*), parameter :: subname = '(stepu_last)' - !--------------------------------------- + end subroutine stepu_iter + +!======================================================================= + + subroutine stepu_last(NA_len, rhow, lb, ub, Cw, aiu, uocn, vocn, & + forcex, forcey, umassdti, fm, uarear, Tbu, uvel_init, vvel_init, & + uvel, vvel, str1, str2, str3, str4, str5, str6, str7, str8, nw, & + sw, sse, skipucell, strintx, strinty, taubx, tauby) + + use ice_kinds_mod + use ice_constants, only : c0, c1 + use ice_dyn_shared, only : brlx, revp, u0, cosw, sinw, & + seabed_stress + + implicit none + + integer(kind=int_kind), intent(in) :: NA_len, lb, ub + real(kind=dbl_kind), intent(in) :: rhow + logical(kind=log_kind), intent(in), dimension(:) :: skipucell + integer(kind=int_kind), dimension(:), intent(in), contiguous :: & + nw, sw, sse + real(kind=dbl_kind), dimension(:), intent(in), contiguous :: & + uvel_init, vvel_init, aiu, forcex, forcey, umassdti, Tbu, & + uocn, vocn, fm, uarear, Cw, str1, str2, str3, str4, str5, & + str6, str7, str8 + real(kind=dbl_kind), dimension(:), intent(inout), contiguous :: & + uvel, vvel, strintx, strinty, taubx, tauby + + ! local variables + + integer(kind=int_kind) :: iw, il, iu + real(kind=dbl_kind) :: uold, vold, vrel, cca, ccb, ab2, cc1, & + cc2, taux, tauy, Cb, tmp_str2_nw, tmp_str3_sse, tmp_str4_sw, & + tmp_str6_sse, tmp_str7_nw, tmp_str8_sw, waterx, watery + + character(len=*), parameter :: subname = '(stepu_last)' #ifdef _OPENACC - !$acc parallel & - !$acc present(Cw,aiu,uocn,vocn,forcex,forcey,umassdti,fm,uarear,Tbu, & - !$acc strintx,strinty,taubx,tauby,uvel_init,vvel_init,nw,sw,se,skipme, & - !$acc str1,str2,str3,str4,str5,str6,str7,str8,uvel,vvel ) - !$acc loop - do iw = 1,NA_len + !$acc parallel & + !$acc present(Cw, aiu, uocn, vocn, forcex, forcey, umassdti, fm, & + !$acc uarear, Tbu, uvel_init, vvel_init, nw, sw, sse, skipucell, & + !$acc str1, str2, str3, str4, str5, str6, str7, str8, uvel, & + !$acc vvel, strintx, strinty, taubx, tauby) + !$acc loop + do iw = 1, NA_len #else - call domp_get_domain(lb,ub,il,iu) - do iw = il, iu + call domp_get_domain(lb, ub, il, iu) + do iw = il, iu #endif - if (skipme(iw)) cycle - uold = uvel(iw) - vold = vvel(iw) - vrel = aiu(iw)*rhow*Cw(iw)*sqrt((uocn(iw)-uold)**2+(vocn(iw)-vold)**2) - waterx = uocn(iw)*cosw - vocn(iw)*sinw*sign(c1,fm(iw)) - watery = vocn(iw)*cosw + uocn(iw)*sinw*sign(c1,fm(iw)) - taux = vrel*waterx - tauy = vrel*watery - Cb = Tbu(iw) / (sqrt(uold**2 + vold**2) + u0) - cca = (brlx + revp)*umassdti(iw) + vrel * cosw + Cb - ccb = fm(iw) + sign(c1,fm(iw)) * vrel * sinw - ab2 = cca**2 + ccb**2 - ! southeast(i,j+1) = se - ! northwest(i+1,j) = nw - ! southwest(i+1,j+1) = sw - tmp_str2_nw = str2(nw(iw)) - tmp_str3_se = str3(se(iw)) - tmp_str4_sw = str4(sw(iw)) - tmp_str6_se = str6(se(iw)) - tmp_str7_nw = str7(nw(iw)) - tmp_str8_sw = str8(sw(iw)) - - strintx(iw) = uarear(iw)*(str1(iw)+tmp_str2_nw+tmp_str3_se+tmp_str4_sw) - strinty(iw) = uarear(iw)*(str5(iw)+tmp_str6_se+tmp_str7_nw+tmp_str8_sw) - cc1 = strintx(iw) + forcex(iw) + taux & - + umassdti(iw)*(brlx*uold + revp*uvel_init(iw)) - cc2 = strinty(iw) + forcey(iw) + tauy & - + umassdti(iw)*(brlx*vold + revp*vvel_init(iw)) - uvel(iw) = (cca*cc1 + ccb*cc2) / ab2 - vvel(iw) = (cca*cc2 - ccb*cc1) / ab2 - ! 
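+ ! In the update below, Cb linearizes the seabed stress about the
+ ! previous iterate, roughly tau_b = -Tbu * u / (|u_old| + u0),
+ ! where the small residual speed u0 keeps the denominator nonzero
+ ! when the ice is at rest (cf. the LKD seabed stress
+ ! parameterization of Lemieux et al. 2015).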
calculate seabed stress component for outputs - if ( seabed_stress ) then - taubx(iw) = -uvel(iw)*Tbu(iw) / (sqrt(uold**2 + vold**2) + u0) - tauby(iw) = -vvel(iw)*Tbu(iw) / (sqrt(uold**2 + vold**2) + u0) - endif - enddo + + if (skipucell(iw)) cycle + + uold = uvel(iw) + vold = vvel(iw) + + vrel = aiu(iw) * rhow * Cw(iw) * sqrt((uocn(iw) - uold)**2 + (vocn(iw) - vold)**2) + + waterx = uocn(iw) * cosw - vocn(iw) * sinw * sign(c1, fm(iw)) + watery = vocn(iw) * cosw + uocn(iw) * sinw * sign(c1, fm(iw)) + + taux = vrel * waterx + tauy = vrel * watery + + Cb = Tbu(iw) / (sqrt(uold**2 + vold**2) + u0) + + cca = (brlx + revp) * umassdti(iw) + vrel * cosw + Cb + ccb = fm(iw) + sign(c1, fm(iw)) * vrel * sinw + + ab2 = cca**2 + ccb**2 + + tmp_str2_nw = str2(nw(iw)) + tmp_str3_sse = str3(sse(iw)) + tmp_str4_sw = str4(sw(iw)) + tmp_str6_sse = str6(sse(iw)) + tmp_str7_nw = str7(nw(iw)) + tmp_str8_sw = str8(sw(iw)) + + strintx(iw) = uarear(iw) * (str1(iw) + tmp_str2_nw + tmp_str3_sse + tmp_str4_sw) + strinty(iw) = uarear(iw) * (str5(iw) + tmp_str6_sse + tmp_str7_nw + tmp_str8_sw) + + cc1 = strintx(iw) + forcex(iw) + taux & + + umassdti(iw) * (brlx * uold + revp * uvel_init(iw)) + cc2 = strinty(iw) + forcey(iw) + tauy & + + umassdti(iw) * (brlx * vold + revp * vvel_init(iw)) + + uvel(iw) = (cca * cc1 + ccb * cc2) / ab2 + vvel(iw) = (cca * cc2 - ccb * cc1) / ab2 + + ! calculate seabed stress component for outputs + if (seabed_stress) then + taubx(iw) = -uvel(iw) * Tbu(iw) / (sqrt(uold**2 + vold**2) + u0) + tauby(iw) = -vvel(iw) * Tbu(iw) / (sqrt(uold**2 + vold**2) + u0) + end if + + end do #ifdef _OPENACC - !$acc end parallel + !$acc end parallel #endif - end subroutine stepu_last + end subroutine stepu_last -!---------------------------------------------------------------------------- +!======================================================================= - subroutine evp1d_halo_update(NAVEL_len,lb,ub,uvel,vvel, halo_parent) + subroutine evp1d_halo_update(NAVEL_len, lb, ub, uvel, vvel, & + halo_parent) - use ice_kinds_mod + use ice_kinds_mod - implicit none + implicit none - integer (kind=int_kind), intent(in) :: NAVEL_len - integer(kind=int_kind),intent(in) :: lb,ub - integer(kind=int_kind),dimension(:), intent(in), contiguous :: halo_parent - real(kind=dbl_kind),dimension(:), intent(inout), contiguous :: uvel,vvel + integer(kind=int_kind), intent(in) :: NAVEL_len, lb, ub + integer(kind=int_kind), dimension(:), intent(in), contiguous :: & + halo_parent + real(kind=dbl_kind), dimension(:), intent(inout), contiguous :: & + uvel, vvel - !-- local variables + ! 
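+ ! halo_parent(iw) holds the 1D index of the interior point that
+ ! halo point iw mirrors, or 0 if no update is needed; copying
+ ! uvel/vvel from the parent plays the role that ice_HaloUpdate
+ ! plays for the 2-D fields in the standard solver.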
local variables - integer (kind=int_kind) :: iw,il,iu + integer (kind=int_kind) :: iw, il, iu - character(len=*), parameter :: subname = '(evp1d_halo_update)' - !--------------------------------------- + character(len=*), parameter :: subname = '(evp1d_halo_update)' #ifdef _OPENACC - !$acc parallel & - !$acc present(uvel,vvel) & - !$acc loop - do iw = 1,NAVEL_len + !$acc parallel & + !$acc present(uvel, vvel) & + !$acc loop + do iw = 1, NAVEL_len + if (halo_parent(iw) == 0) cycle + uvel(iw) = uvel(halo_parent(iw)) + vvel(iw) = vvel(halo_parent(iw)) + end do + !$acc end parallel #else - call domp_get_domain(lb,ub,il,iu) - do iw = il, iu -#endif - if (halo_parent(iw)==0) cycle - uvel(iw) = uvel(halo_parent(iw)) - vvel(iw) = vvel(halo_parent(iw)) - enddo -#ifdef _OPENACC - !$acc end parallel + call domp_get_domain(lb, ub, il, iu) + do iw = il, iu + if (halo_parent(iw) == 0) cycle + uvel(iw) = uvel(halo_parent(iw)) + vvel(iw) = vvel(halo_parent(iw)) + end do + call domp_get_domain(ub + 1, NAVEL_len, il, iu) + do iw = il, iu + if (halo_parent(iw) == 0) cycle + uvel(iw) = uvel(halo_parent(iw)) + vvel(iw) = vvel(halo_parent(iw)) + end do #endif - end subroutine evp1d_halo_update - -!---------------------------------------------------------------------------- - -!former end module bench_v2 - -!=============================================================================== -!---------------------------------------------------------------------------- - - subroutine alloc1d(na) - - implicit none - - integer(kind=int_kind),intent(in) :: na - integer(kind=int_kind) :: ierr,nb - - character(len=*), parameter :: subname = '(alloc1d)' - !--------------------------------------- - - nb=na - allocate( & - ! U+T cells - ! Helper index for neighbours - indj(1:na),indi(1:na), & - ee(1:na),ne(1:na),se(1:na), & - nw(1:nb),sw(1:nb),sse(1:nb), & - skipucell(1:na), & - ! Grid distances: HTE,HTN + "-1 neighbours" - HTE(1:na),HTN(1:na), & - HTEm1(1:na),HTNm1(1:na), & - ! T cells -!v1 dxhy(1:na),dyhx(1:na),cyp(1:na),cxp(1:na),cym(1:na),cxm(1:na),tinyarea(1:na),& - strength(1:na),dxt(1:na),dyt(1:na), tarear(1:na), & - stressp_1(1:na), stressp_2(1:na), stressp_3(1:na), stressp_4(1:na), & - stressm_1(1:na), stressm_2(1:na), stressm_3(1:na), stressm_4(1:na), & - stress12_1(1:na),stress12_2(1:na),stress12_3(1:na),stress12_4(1:na),& - divu(1:na),rdg_conv(1:na),rdg_shear(1:na),shear(1:na), & - ! 
U cells -!v1 waterx(1:nb),watery(1:nb), & - cdn_ocn(1:nb),aiu(1:nb),uocn(1:nb),vocn(1:nb), & - forcex(1:nb),forcey(1:nb),Tbu(1:nb), & - umassdti(1:nb),fm(1:nb),uarear(1:nb), & - strintx(1:nb),strinty(1:nb), & - uvel_init(1:nb),vvel_init(1:nb), & - taubx(1:nb),tauby(1:nb), & - stat=ierr) - - if (ierr/=0) call abort_ice(subname//': ERROR allocating 1D') - - end subroutine alloc1d - -!---------------------------------------------------------------------------- - - subroutine alloc1d_navel(navel) - - implicit none - - integer(kind=int_kind),intent(in) :: navel - integer(kind=int_kind) :: ierr - - character(len=*), parameter :: subname = '(alloc1d_navel)' - !--------------------------------------- - - allocate( & - uvel(1:navel),vvel(1:navel), indij(1:navel), halo_parent(1:navel), & - str1(1:navel),str2(1:navel),str3(1:navel),str4(1:navel), & - str5(1:navel),str6(1:navel),str7(1:navel),str8(1:navel), & - stat=ierr) - if (ierr/=0) call abort_ice(subname// ': Error allocating 1D navel') - - end subroutine alloc1d_navel - -!---------------------------------------------------------------------------- - - subroutine dealloc1d - - implicit none - - integer(kind=int_kind) :: ierr - - character(len=*), parameter :: subname = '(dealloc1d)' - !--------------------------------------- - - deallocate( & - ! U+T cells - ! Helper index for neighbours - indj,indi, & - ee,ne,se, & - nw,sw,sse, & - skipucell, & - ! T cells - strength,dxt,dyt,tarear, & - stressp_1, stressp_2, stressp_3, stressp_4, & - stressm_1, stressm_2, stressm_3, stressm_4, & - stress12_1,stress12_2,stress12_3,stress12_4,& - str1, str2,str3,str4, & - str5, str6,str7,str8, & - divu,rdg_conv,rdg_shear,shear, & - ! U cells - cdn_ocn,aiu,uocn,vocn, & - forcex,forcey,Tbu, & - umassdti,fm,uarear, & - strintx,strinty, & - uvel_init,vvel_init, & - taubx,tauby, & - ! NAVEL - uvel,vvel, indij, halo_parent, & - stat=ierr) - - if (ierr/=0) call abort_ice(subname//': Error de-allocating 1D') - -!v1 if (allocated(tinyarea)) then -!v1 deallocate( & -!v1 dxhy,dyhx,cyp,cxp,cym,cxm,tinyarea, & -!v1 waterx,watery, & -!v1 stat=ierr) -!v1 if (ierr/=0) call abort_ice(subname//': Error de-allocating 1D, v1') -!v1 endif - - if (allocated(HTE)) then - deallocate( & - ! 
Grid distances: HTE,HTN + "-1 neighbours" - HTE,HTN, HTEm1,HTNm1, & - stat=ierr) - if (ierr/=0) call abort_ice(subname//': Error de-allocating 1D, v2') - endif - - end subroutine dealloc1d - -!---------------------------------------------------------------------------- - - subroutine evp_copyin_v2(nx,ny,nblk,nx_glob,ny_glob, & - I_HTE,I_HTN, & -!v1 I_dxhy,I_dyhx,I_cyp,I_cxp,I_cym,I_cxm,I_tinyarea, & -!v1 I_waterx,I_watery, & - I_icetmask,I_iceumask, & - I_cdn_ocn,I_aiu,I_uocn,I_vocn,I_forcex,I_forcey,I_Tbu, & - I_umassdti,I_fm,I_uarear,I_tarear,I_strintx,I_strinty,I_uvel_init,I_vvel_init, & - I_strength,I_uvel,I_vvel,I_dxt,I_dyt, & - I_stressp_1 ,I_stressp_2, I_stressp_3, I_stressp_4, & - I_stressm_1 ,I_stressm_2, I_stressm_3, I_stressm_4, & - I_stress12_1,I_stress12_2,I_stress12_3,I_stress12_4 ) - - use ice_gather_scatter, only: gather_global_ext - use ice_domain, only: distrb_info - use ice_communicate, only: my_task, master_task - - implicit none - - integer(int_kind), intent(in) :: nx, ny, nblk, nx_glob, ny_glob - integer (kind=int_kind),dimension (nx,ny,nblk), intent(in) :: I_icetmask - logical (kind=log_kind),dimension (nx,ny,nblk), intent(in) :: I_iceumask - real (kind=dbl_kind), dimension(nx,ny,nblk), intent(in) :: & - I_HTE,I_HTN, & -!v1 I_dxhy,I_dyhx,I_cyp,I_cxp,I_cym,I_cxm,I_tinyarea, & -!v1 I_waterx,I_watery, & - I_cdn_ocn,I_aiu,I_uocn,I_vocn,I_forcex,I_forcey,I_Tbu, & - I_umassdti,I_fm,I_uarear,I_tarear,I_strintx,I_strinty,I_uvel_init,I_vvel_init, & - I_strength,I_uvel,I_vvel,I_dxt,I_dyt, & - I_stressp_1, I_stressp_2, I_stressp_3, I_stressp_4, & - I_stressm_1, I_stressm_2, I_stressm_3, I_stressm_4, & - I_stress12_1,I_stress12_2,I_stress12_3,I_stress12_4 - - !-- local variables - - integer (kind=int_kind),dimension (nx_glob,ny_glob) :: G_icetmask - logical (kind=log_kind),dimension (nx_glob,ny_glob) :: G_iceumask - real (kind=dbl_kind), dimension(nx_glob,ny_glob) :: & - G_HTE,G_HTN, & -!v1 G_dxhy,G_dyhx,G_cyp,G_cxp,G_cym,G_cxm,G_tinyarea, & -!v1 G_waterx,G_watery, & - G_cdn_ocn,G_aiu,G_uocn,G_vocn,G_forcex,G_forcey,G_Tbu, & - G_umassdti,G_fm,G_uarear,G_tarear,G_strintx,G_strinty,G_uvel_init,G_vvel_init, & - G_strength,G_uvel,G_vvel,G_dxt,G_dyt, & - G_stressp_1, G_stressp_2, G_stressp_3, G_stressp_4, & - G_stressm_1, G_stressm_2, G_stressm_3, G_stressm_4, & - G_stress12_1,G_stress12_2,G_stress12_3,G_stress12_4 - integer(kind=int_kind) :: na, navel - - character(len=*), parameter :: subname = '(evp_copyin_v2)' - !--------------------------------------- - !-- Gather data into one single block -- - - call gather_global_ext(G_icetmask, I_icetmask, master_task, distrb_info) - call gather_global_ext(G_iceumask, I_iceumask, master_task, distrb_info) - call gather_global_ext(G_HTE, I_HTE, master_task, distrb_info) - call gather_global_ext(G_HTN, I_HTN, master_task, distrb_info) -!v1 call gather_global_ext(G_dxhy, I_dxhy, master_task, distrb_info) -!v1 call gather_global_ext(G_dyhx, I_dyhx, master_task, distrb_info) -!v1 call gather_global_ext(G_cyp, I_cyp, master_task, distrb_info) -!v1 call gather_global_ext(G_cxp, I_cxp, master_task, distrb_info) -!v1 call gather_global_ext(G_cym, I_cym, master_task, distrb_info) -!v1 call gather_global_ext(G_cxm, I_cxm, master_task, distrb_info) -!v1 call gather_global_ext(G_tinyarea, I_tinyarea, master_task, distrb_info) -!v1 call gather_global_ext(G_waterx, I_waterx, master_task, distrb_info) -!v1 call gather_global_ext(G_watery, I_watery, master_task, distrb_info) - call gather_global_ext(G_cdn_ocn, I_cdn_ocn, master_task, distrb_info) - call 
gather_global_ext(G_aiu, I_aiu, master_task, distrb_info) - call gather_global_ext(G_uocn, I_uocn, master_task, distrb_info) - call gather_global_ext(G_vocn, I_vocn, master_task, distrb_info) - call gather_global_ext(G_forcex, I_forcex, master_task, distrb_info) - call gather_global_ext(G_forcey, I_forcey, master_task, distrb_info) - call gather_global_ext(G_Tbu, I_Tbu, master_task, distrb_info) - call gather_global_ext(G_umassdti, I_umassdti, master_task, distrb_info) - call gather_global_ext(G_fm, I_fm, master_task, distrb_info) - call gather_global_ext(G_uarear, I_uarear, master_task, distrb_info) - call gather_global_ext(G_tarear, I_tarear, master_task, distrb_info) - call gather_global_ext(G_strintx, I_strintx, master_task, distrb_info) - call gather_global_ext(G_strinty, I_strinty, master_task, distrb_info) - call gather_global_ext(G_uvel_init, I_uvel_init, master_task, distrb_info) - call gather_global_ext(G_vvel_init, I_vvel_init, master_task, distrb_info) - call gather_global_ext(G_strength, I_strength, master_task, distrb_info) - call gather_global_ext(G_uvel, I_uvel, master_task, distrb_info) - call gather_global_ext(G_vvel, I_vvel, master_task, distrb_info) - call gather_global_ext(G_dxt, I_dxt, master_task, distrb_info) - call gather_global_ext(G_dyt, I_dyt, master_task, distrb_info) - call gather_global_ext(G_stressp_1, I_stressp_1, master_task, distrb_info) - call gather_global_ext(G_stressp_2, I_stressp_2, master_task, distrb_info) - call gather_global_ext(G_stressp_3, I_stressp_3, master_task, distrb_info) - call gather_global_ext(G_stressp_4, I_stressp_4, master_task, distrb_info) - call gather_global_ext(G_stressm_1, I_stressm_1, master_task, distrb_info) - call gather_global_ext(G_stressm_2, I_stressm_2, master_task, distrb_info) - call gather_global_ext(G_stressm_3, I_stressm_3, master_task, distrb_info) - call gather_global_ext(G_stressm_4, I_stressm_4, master_task, distrb_info) - call gather_global_ext(G_stress12_1, I_stress12_1, master_task, distrb_info) - call gather_global_ext(G_stress12_2, I_stress12_2, master_task, distrb_info) - call gather_global_ext(G_stress12_3, I_stress12_3, master_task, distrb_info) - call gather_global_ext(G_stress12_4, I_stress12_4, master_task, distrb_info) - - !-- All calculations has to be done on the master-task -- - - if (my_task == master_task) then - !-- Find number of active points and allocate vectors -- - call calc_na(nx_glob,ny_glob,na,G_icetmask) - call alloc1d(na) - call calc_2d_indices(nx_glob,ny_glob,na, G_icetmask, G_iceumask) - call calc_navel(nx_glob,ny_glob,na,navel) - call alloc1d_navel(navel) -!MHRI !$OMP PARALLEL DEFAULT(shared) - call numainit(1,na,navel) -!MHRI !$OMP END PARALLEL - ! Remap 2d to 1d and fill in - call convert_2d_1d(nx_glob,ny_glob,na,navel, & - G_HTE,G_HTN, & -!v1 G_dxhy,G_dyhx,G_cyp,G_cxp,G_cym,G_cxm,G_tinyarea, & -!v1 G_waterx,G_watery, & - G_cdn_ocn,G_aiu,G_uocn,G_vocn,G_forcex,G_forcey,G_Tbu, & - G_umassdti,G_fm,G_uarear,G_tarear,G_strintx,G_strinty,G_uvel_init,G_vvel_init, & - G_strength,G_uvel,G_vvel,G_dxt,G_dyt, & - G_stressp_1, G_stressp_2, G_stressp_3, G_stressp_4, & - G_stressm_1, G_stressm_2, G_stressm_3, G_stressm_4, & - G_stress12_1,G_stress12_2,G_stress12_3,G_stress12_4 ) - call calc_halo_parent(nx_glob,ny_glob,na,navel, G_icetmask) - NA_len=na - NAVEL_len=navel - endif - - !-- write check -!if (1 == 1) then -! write(nu_diag,*) subname,' MHRI: INDICES start:' -! write(nu_diag,*) 'na,navel ', na,navel -! write(nu_diag,*) 'Min/max ee', minval(ee(1:na)), maxval(ee(1:na)) -! 
write(nu_diag,*) 'Min/max ne', minval(ne(1:na)), maxval(ne(1:na)) -! write(nu_diag,*) 'Min/max se', minval(se(1:na)), maxval(se(1:na)) -! write(nu_diag,*) 'Min/max nw', minval(nw(1:na)), maxval(nw(1:na)) -! write(nu_diag,*) 'Min/max sw', minval(sw(1:na)), maxval(sw(1:na)) -! write(nu_diag,*) 'Min/max sse', minval(sse(1:na)), maxval(sse(1:na)) -! write(nu_diag,*) subname,' MHRI: INDICES end:' -!endif - - end subroutine evp_copyin_v2 - -!---------------------------------------------------------------------------- - - subroutine evp_copyout(nx,ny,nblk,nx_glob,ny_glob, & - I_uvel,I_vvel, I_strintx,I_strinty, & - I_stressp_1, I_stressp_2, I_stressp_3, I_stressp_4, & - I_stressm_1, I_stressm_2, I_stressm_3, I_stressm_4, & - I_stress12_1,I_stress12_2,I_stress12_3,I_stress12_4, & - I_divu,I_rdg_conv,I_rdg_shear,I_shear,I_taubx,I_tauby ) - - use ice_constants, only : c0 - use ice_gather_scatter, only: scatter_global_ext - use ice_domain, only: distrb_info - use ice_communicate, only: my_task, master_task - - implicit none - - integer(int_kind), intent(in) :: nx,ny,nblk, nx_glob,ny_glob - real(dbl_kind), dimension(nx,ny,nblk), intent(out) :: & - I_uvel,I_vvel, I_strintx,I_strinty, & - I_stressp_1, I_stressp_2, I_stressp_3, I_stressp_4, & - I_stressm_1, I_stressm_2, I_stressm_3, I_stressm_4, & - I_stress12_1,I_stress12_2,I_stress12_3,I_stress12_4, & - I_divu,I_rdg_conv, I_rdg_shear,I_shear, I_taubx,I_tauby - - !-- local variables - - real(dbl_kind), dimension(nx_glob,ny_glob) :: & - G_uvel,G_vvel, G_strintx,G_strinty, & - G_stressp_1, G_stressp_2, G_stressp_3, G_stressp_4, & - G_stressm_1, G_stressm_2, G_stressm_3, G_stressm_4, & - G_stress12_1,G_stress12_2,G_stress12_3,G_stress12_4, & - G_divu,G_rdg_conv, G_rdg_shear,G_shear, G_taubx,G_tauby - integer(int_kind) :: i,j,iw, nx_block - - character(len=*), parameter :: subname = '(evp_copyout)' - !--------------------------------------- - ! Remap 1d to 2d and fill in - nx_block=nx_glob ! Total block size in x-dir - - if (my_task == master_task) then - G_uvel = c0 - G_vvel = c0 - G_strintx = c0 - G_strinty = c0 - G_stressp_1 = c0 - G_stressp_2 = c0 - G_stressp_3 = c0 - G_stressp_4 = c0 - G_stressm_1 = c0 - G_stressm_2 = c0 - G_stressm_3 = c0 - G_stressm_4 = c0 - G_stress12_1 = c0 - G_stress12_2 = c0 - G_stress12_3 = c0 - G_stress12_4 = c0 - G_divu = c0 - G_rdg_conv = c0 - G_rdg_shear = c0 - G_shear = c0 - G_taubx = c0 - G_tauby = c0 - !$OMP PARALLEL PRIVATE(iw,i,j) - do iw=1,NAVEL_len - j=int((indij(iw)-1)/(nx_block))+1 - i=indij(iw)-(j-1)*nx_block - G_uvel(i,j) = uvel(iw) - G_vvel(i,j) = vvel(iw) - enddo - !$OMP END PARALLEL - !$OMP PARALLEL PRIVATE(iw,i,j) - do iw=1,NA_len - i=indi(iw) - j=indj(iw) -! G_uvel(i,j) = uvel(iw) ! done above -! G_vvel(i,j) = vvel(iw) ! 
done above - G_strintx(i,j) = strintx(iw) - G_strinty(i,j) = strinty(iw) - G_stressp_1(i,j) = stressp_1(iw) - G_stressp_2(i,j) = stressp_2(iw) - G_stressp_3(i,j) = stressp_3(iw) - G_stressp_4(i,j) = stressp_4(iw) - G_stressm_1(i,j) = stressm_1(iw) - G_stressm_2(i,j) = stressm_2(iw) - G_stressm_3(i,j) = stressm_3(iw) - G_stressm_4(i,j) = stressm_4(iw) - G_stress12_1(i,j) = stress12_1(iw) - G_stress12_2(i,j) = stress12_2(iw) - G_stress12_3(i,j) = stress12_3(iw) - G_stress12_4(i,j) = stress12_4(iw) - G_divu(i,j) = divu(iw) - G_rdg_conv(i,j) = rdg_conv(iw) - G_rdg_shear(i,j) = rdg_shear(iw) - G_shear(i,j) = shear(iw) - G_taubx(i,j) = taubx(iw) - G_tauby(i,j) = tauby(iw) - enddo - !$OMP END PARALLEL - call dealloc1d() - endif - - !-- Scatter data into blocks -- - !-- has to be done on all tasks -- - - call scatter_global_ext(I_uvel, G_uvel, master_task, distrb_info) - call scatter_global_ext(I_vvel, G_vvel, master_task, distrb_info) - call scatter_global_ext(I_strintx, G_strintx, master_task, distrb_info) - call scatter_global_ext(I_strinty, G_strinty, master_task, distrb_info) - call scatter_global_ext(I_stressp_1, G_stressp_1, master_task, distrb_info) - call scatter_global_ext(I_stressp_2, G_stressp_2, master_task, distrb_info) - call scatter_global_ext(I_stressp_3, G_stressp_3, master_task, distrb_info) - call scatter_global_ext(I_stressp_4, G_stressp_4, master_task, distrb_info) - call scatter_global_ext(I_stressm_1, G_stressm_1, master_task, distrb_info) - call scatter_global_ext(I_stressm_2, G_stressm_2, master_task, distrb_info) - call scatter_global_ext(I_stressm_3, G_stressm_3, master_task, distrb_info) - call scatter_global_ext(I_stressm_4, G_stressm_4, master_task, distrb_info) - call scatter_global_ext(I_stress12_1, G_stress12_1, master_task, distrb_info) - call scatter_global_ext(I_stress12_2, G_stress12_2, master_task, distrb_info) - call scatter_global_ext(I_stress12_3, G_stress12_3, master_task, distrb_info) - call scatter_global_ext(I_stress12_4, G_stress12_4, master_task, distrb_info) - call scatter_global_ext(I_divu, G_divu, master_task, distrb_info) - call scatter_global_ext(I_rdg_conv, G_rdg_conv, master_task, distrb_info) - call scatter_global_ext(I_rdg_shear, G_rdg_shear, master_task, distrb_info) - call scatter_global_ext(I_shear, G_shear, master_task, distrb_info) - call scatter_global_ext(I_taubx, G_taubx, master_task, distrb_info) - call scatter_global_ext(I_tauby, G_tauby, master_task, distrb_info) - - end subroutine evp_copyout - -!---------------------------------------------------------------------------- - - subroutine evp_kernel_v2 - - use ice_constants, only : c0 - use ice_dyn_shared, only: ndte - use ice_communicate, only: my_task, master_task - implicit none - - real(kind=dbl_kind) :: rhow - integer (kind=int_kind) :: i, nthreads - integer (kind=int_kind) :: na,nb,navel - - character(len=*), parameter :: subname = '(evp_kernel_v2)' - !--------------------------------------- - !-- All calculations has to be done on one single node (choose master-task) -- - - if (my_task == master_task) then - - !- Read constants... - call icepack_query_parameters(rhow_out=rhow) - call icepack_warnings_flush(nu_diag) - if (icepack_warnings_aborted()) call abort_ice(error_message=subname, & - file=__FILE__, line=__LINE__) - na=NA_len - nb=NA_len - navel=NAVEL_len - - !- Initialize openmp --------------------------------------------------------- - call domp_init(nthreads) ! 
ought to be called from main - - !- Initialize timers --------------------------------------------------------- - str1=c0 - str2=c0 - str3=c0 - str4=c0 - str5=c0 - str6=c0 - str7=c0 - str8=c0 - - if (ndte<2) call abort_ice(subname//' ERROR: ndte must be 2 or higher for this kernel') - - !$OMP PARALLEL PRIVATE(i) - do i = 1, ndte-1 - call evp1d_stress(NA_len, & - ee,ne,se,1,na,uvel,vvel,dxt,dyt, & - hte,htn,htem1,htnm1, & - strength,stressp_1,stressp_2,stressp_3,stressp_4, & - stressm_1,stressm_2,stressm_3,stressm_4,stress12_1, & - stress12_2,stress12_3,stress12_4,str1,str2,str3, & - str4,str5,str6,str7,str8) - !$OMP BARRIER - call evp1d_stepu(NA_len, rhow, & - 1,nb,cdn_ocn,aiu,uocn,vocn,forcex,forcey,umassdti,fm,uarear,Tbu,& - uvel_init,vvel_init,uvel,vvel, & - str1,str2,str3,str4,str5,str6,str7,str8, nw,sw,sse,skipucell) - !$OMP BARRIER - call evp1d_halo_update(NA_len,1,navel,uvel,vvel, halo_parent) - !$OMP BARRIER - enddo - - call evp1d_stress(NA_len, tarear, & - ee,ne,se,1,na,uvel,vvel,dxt,dyt, & - hte,htn,htem1,htnm1, & - strength,stressp_1,stressp_2,stressp_3,stressp_4, & - stressm_1,stressm_2,stressm_3,stressm_4,stress12_1, & - stress12_2,stress12_3,stress12_4, & - divu,rdg_conv,rdg_shear,shear, & - str1,str2,str3,str4,str5,str6,str7,str8) - !$OMP BARRIER - call evp1d_stepu(NA_len, rhow, & - 1,nb,cdn_ocn,aiu,uocn,vocn,forcex,forcey,umassdti,fm,uarear,Tbu,& - strintx,strinty,taubx,tauby, & - uvel_init,vvel_init,uvel,vvel, & - str1,str2,str3,str4,str5,str6,str7,str8, nw,sw,sse,skipucell) - !$OMP BARRIER - call evp1d_halo_update(NA_len,1,navel,uvel,vvel, halo_parent) + end subroutine evp1d_halo_update + +!======================================================================= + + subroutine alloc1d(na) + + implicit none + + integer(kind=int_kind), intent(in) :: na + + ! local variables + + integer(kind=int_kind) :: ierr + + character(len=*), parameter :: subname = '(alloc1d)' + + allocate( & + ! helper indices for neighbours + indj(1:na), indi(1:na), ee(1:na), ne(1:na), se(1:na), & + nw(1:na), sw(1:na), sse(1:na), skipucell(1:na), & + skiptcell(1:na), & + ! grid distances and their "-1 neighbours" + HTE(1:na), HTN(1:na), HTEm1(1:na), HTNm1(1:na), & + ! T cells + strength(1:na), dxt(1:na), dyt(1:na), tarear(1:na), & + stressp_1(1:na), stressp_2(1:na), stressp_3(1:na), & + stressp_4(1:na), stressm_1(1:na), stressm_2(1:na), & + stressm_3(1:na), stressm_4(1:na), stress12_1(1:na), & + stress12_2(1:na), stress12_3(1:na), stress12_4(1:na), & + divu(1:na), rdg_conv(1:na), rdg_shear(1:na), shear(1:na), & + ! U cells + cdn_ocn(1:na), aiu(1:na), uocn(1:na), vocn(1:na), & + forcex(1:na), forcey(1:na), Tbu(1:na), umassdti(1:na), & + fm(1:na), uarear(1:na), strintx(1:na), strinty(1:na), & + uvel_init(1:na), vvel_init(1:na), taubx(1:na), tauby(1:na), & + ! error handling + stat=ierr) + + if (ierr /= 0) call abort_ice(subname & + // ' ERROR: could not allocate 1D arrays') + + end subroutine alloc1d + +!======================================================================= + + subroutine alloc1d_navel(navel) + + implicit none + + integer(kind=int_kind), intent(in) :: navel + + ! 
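+ ! Storage is sized in two tiers: alloc1d(na) covers the na active
+ ! ice points, while this routine allocates length navel >= na so
+ ! that uvel, vvel, str1-str8 and the index arrays also have room
+ ! for the neighbour points read by the finite-difference stencil.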
local variables + + integer(kind=int_kind) :: ierr + + character(len=*), parameter :: subname = '(alloc1d_navel)' + + allocate(uvel(1:navel), vvel(1:navel), indij(1:navel), & + halo_parent(1:navel), str1(1:navel), str2(1:navel), & + str3(1:navel), str4(1:navel), str5(1:navel), str6(1:navel), & + str7(1:navel), str8(1:navel), stat=ierr) + + if (ierr /= 0) call abort_ice(subname & + // ' ERROR: could not allocate 1D arrays') + + end subroutine alloc1d_navel + +!======================================================================= + + subroutine dealloc1d + + implicit none + + ! local variables + + integer(kind=int_kind) :: ierr + + character(len=*), parameter :: subname = '(dealloc1d)' + + deallocate( & + ! helper indices for neighbours + indj, indi, ee, ne, se, nw, sw, sse, skipucell, skiptcell, & + ! grid distances and their "-1 neighbours" + HTE, HTN, HTEm1, HTNm1, & + ! T cells + strength, dxt, dyt, tarear, stressp_1, stressp_2, stressp_3, & + stressp_4, stressm_1, stressm_2, stressm_3, stressm_4, & + stress12_1, stress12_2, stress12_3, stress12_4, str1, str2, & + str3, str4, str5, str6, str7, str8, divu, rdg_conv, & + rdg_shear, shear, & + ! U cells + cdn_ocn, aiu, uocn, vocn, forcex, forcey, Tbu, umassdti, fm, & + uarear, strintx, strinty, uvel_init, vvel_init, taubx, tauby, & + uvel, vvel, indij, halo_parent, & + ! error handling + stat=ierr) + + if (ierr /= 0) call abort_ice(subname & + // ' ERROR: could not deallocate 1D arrays') + + end subroutine dealloc1d + +!======================================================================= + + subroutine ice_dyn_evp_1d_copyin(nx, ny, nblk, nx_glob, ny_glob, & + I_icetmask, I_iceumask, I_cdn_ocn, I_aiu, I_uocn, I_vocn, & + I_forcex, I_forcey, I_Tbu, I_umassdti, I_fm, I_uarear, I_tarear, & + I_strintx, I_strinty, I_uvel_init, I_vvel_init, I_strength, & + I_uvel, I_vvel, I_dxt, I_dyt, I_stressp_1, I_stressp_2, & + I_stressp_3, I_stressp_4, I_stressm_1, I_stressm_2, I_stressm_3, & + I_stressm_4, I_stress12_1, I_stress12_2, I_stress12_3, & + I_stress12_4) + + use ice_gather_scatter, only : gather_global_ext + use ice_domain, only : distrb_info + use ice_communicate, only : my_task, master_task + use ice_grid, only : G_HTE, G_HTN + use ice_constants, only : c0 + + implicit none + + integer(int_kind), intent(in) :: nx, ny, nblk, nx_glob, ny_glob + logical(kind=log_kind), dimension(nx, ny, nblk), intent(in) :: & + I_iceumask + integer(kind=int_kind), dimension(nx, ny, nblk), intent(in) :: & + I_icetmask + real(kind=dbl_kind), dimension(nx, ny, nblk), intent(in) :: & + I_cdn_ocn, I_aiu, I_uocn, I_vocn, I_forcex, I_forcey, I_Tbu, & + I_umassdti, I_fm, I_uarear, I_tarear, I_strintx, I_strinty, & + I_uvel_init, I_vvel_init, I_strength, I_uvel, I_vvel, I_dxt, & + I_dyt, I_stressp_1, I_stressp_2, I_stressp_3, I_stressp_4, & + I_stressm_1, I_stressm_2, I_stressm_3, I_stressm_4, & + I_stress12_1, I_stress12_2, I_stress12_3, I_stress12_4 + + ! 
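+ ! Note: the G_uvel/G_vvel gathers below pass c0 as an extra
+ ! argument, presumably the fill value used by gather_global_ext
+ ! for points outside the local blocks, so the velocities start
+ ! from zero there instead of from uninitialized memory.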
local variables
+
+ logical(kind=log_kind), dimension(nx_glob, ny_glob) :: &
+ G_iceumask
+ integer(kind=int_kind), dimension(nx_glob, ny_glob) :: &
+ G_icetmask
+ real(kind=dbl_kind), dimension(nx_glob, ny_glob) :: &
+ G_cdn_ocn, G_aiu, G_uocn, G_vocn, G_forcex, G_forcey, G_Tbu, &
+ G_umassdti, G_fm, G_uarear, G_tarear, G_strintx, G_strinty, &
+ G_uvel_init, G_vvel_init, G_strength, G_uvel, G_vvel, G_dxt, &
+ G_dyt, G_stressp_1, G_stressp_2, G_stressp_3, G_stressp_4, &
+ G_stressm_1, G_stressm_2, G_stressm_3, G_stressm_4, &
+ G_stress12_1, G_stress12_2, G_stress12_3, G_stress12_4
+
+ character(len=*), parameter :: &
+ subname = '(ice_dyn_evp_1d_copyin)'
+
+ call gather_global_ext(G_icetmask, I_icetmask, master_task, distrb_info )
+ call gather_global_ext(G_iceumask, I_iceumask, master_task, distrb_info )
+ call gather_global_ext(G_cdn_ocn, I_cdn_ocn, master_task, distrb_info )
+ call gather_global_ext(G_aiu, I_aiu, master_task, distrb_info )
+ call gather_global_ext(G_uocn, I_uocn, master_task, distrb_info )
+ call gather_global_ext(G_vocn, I_vocn, master_task, distrb_info )
+ call gather_global_ext(G_forcex, I_forcex, master_task, distrb_info )
+ call gather_global_ext(G_forcey, I_forcey, master_task, distrb_info )
+ call gather_global_ext(G_Tbu, I_Tbu, master_task, distrb_info )
+ call gather_global_ext(G_umassdti, I_umassdti, master_task, distrb_info )
+ call gather_global_ext(G_fm, I_fm, master_task, distrb_info )
+ call gather_global_ext(G_uarear, I_uarear, master_task, distrb_info )
+ call gather_global_ext(G_tarear, I_tarear, master_task, distrb_info )
+ call gather_global_ext(G_strintx, I_strintx, master_task, distrb_info )
+ call gather_global_ext(G_strinty, I_strinty, master_task, distrb_info )
+ call gather_global_ext(G_uvel_init, I_uvel_init, master_task, distrb_info )
+ call gather_global_ext(G_vvel_init, I_vvel_init, master_task, distrb_info )
+ call gather_global_ext(G_strength, I_strength, master_task, distrb_info )
+ call gather_global_ext(G_uvel, I_uvel, master_task, distrb_info, c0)
+ call gather_global_ext(G_vvel, I_vvel, master_task, distrb_info, c0)
+ call gather_global_ext(G_dxt, I_dxt, master_task, distrb_info )
+ call gather_global_ext(G_dyt, I_dyt, master_task, distrb_info )
+ call gather_global_ext(G_stressp_1, I_stressp_1, master_task, distrb_info )
+ call gather_global_ext(G_stressp_2, I_stressp_2, master_task, distrb_info )
+ call gather_global_ext(G_stressp_3, I_stressp_3, master_task, distrb_info )
+ call gather_global_ext(G_stressp_4, I_stressp_4, master_task, distrb_info )
+ call gather_global_ext(G_stressm_1, I_stressm_1, master_task, distrb_info )
+ call gather_global_ext(G_stressm_2, I_stressm_2, master_task, distrb_info )
+ call gather_global_ext(G_stressm_3, I_stressm_3, master_task, distrb_info )
+ call gather_global_ext(G_stressm_4, I_stressm_4, master_task, distrb_info )
+ call gather_global_ext(G_stress12_1, I_stress12_1, master_task, distrb_info )
+ call gather_global_ext(G_stress12_2, I_stress12_2, master_task, distrb_info )
+ call gather_global_ext(G_stress12_3, I_stress12_3, master_task, distrb_info )
+ call gather_global_ext(G_stress12_4, I_stress12_4, master_task, distrb_info )
+
+ ! all calculations are done on the master task
+ if (my_task == master_task) then
+ ! 
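+ ! The block below builds the 1D data structures on the master
+ ! task: count the active points (calc_na), allocate them
+ ! (alloc1d), record their 2D indices (calc_2d_indices), extend
+ ! the count with stencil neighbours (calc_navel, alloc1d_navel),
+ ! flatten the 2D fields (convert_2d_1d) and set up the halo
+ ! mapping (calc_halo_parent).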
find number of active points and allocate 1D vectors + call calc_na(nx_glob, ny_glob, NA_len, G_icetmask, G_iceumask) + call alloc1d(NA_len) + call calc_2d_indices(nx_glob, ny_glob, NA_len, G_icetmask, G_iceumask) + call calc_navel(nx_glob, ny_glob, NA_len, NAVEL_len) + call alloc1d_navel(NAVEL_len) + ! initialize OpenMP. FIXME: ought to be called from main + call domp_init() + !$OMP PARALLEL DEFAULT(shared) + call numainit(1, NA_len, NAVEL_len) + !$OMP END PARALLEL + ! map 2D arrays to 1D arrays + call convert_2d_1d(nx_glob, ny_glob, NA_len, NAVEL_len, & + G_HTE, G_HTN, G_cdn_ocn, G_aiu, G_uocn, G_vocn, G_forcex, & + G_forcey, G_Tbu, G_umassdti, G_fm, G_uarear, G_tarear, & + G_strintx, G_strinty, G_uvel_init, G_vvel_init, & + G_strength, G_uvel, G_vvel, G_dxt, G_dyt, G_stressp_1, & + G_stressp_2, G_stressp_3, G_stressp_4, G_stressm_1, & + G_stressm_2, G_stressm_3, G_stressm_4, G_stress12_1, & + G_stress12_2, G_stress12_3, G_stress12_4) + call calc_halo_parent(nx_glob, ny_glob, NA_len, NAVEL_len, G_icetmask) + end if + + end subroutine ice_dyn_evp_1d_copyin + +!======================================================================= + + subroutine ice_dyn_evp_1d_copyout(nx, ny, nblk, nx_glob, ny_glob, & + I_uvel, I_vvel, I_strintx, I_strinty, I_stressp_1, I_stressp_2, & + I_stressp_3, I_stressp_4, I_stressm_1, I_stressm_2, I_stressm_3, & + I_stressm_4, I_stress12_1, I_stress12_2, I_stress12_3, & + I_stress12_4, I_divu, I_rdg_conv, I_rdg_shear, I_shear, I_taubx, & + I_tauby) + + use ice_constants, only : c0 + use ice_gather_scatter, only : scatter_global_ext + use ice_domain, only : distrb_info + use ice_communicate, only : my_task, master_task + + implicit none + + integer(int_kind), intent(in) :: nx, ny, nblk, nx_glob, ny_glob + real(dbl_kind), dimension(nx, ny, nblk), intent(out) :: I_uvel, & + I_vvel, I_strintx, I_strinty, I_stressp_1, I_stressp_2, & + I_stressp_3, I_stressp_4, I_stressm_1, I_stressm_2, & + I_stressm_3, I_stressm_4, I_stress12_1, I_stress12_2, & + I_stress12_3, I_stress12_4, I_divu, I_rdg_conv, I_rdg_shear, & + I_shear, I_taubx, I_tauby + + ! local variables + + integer(int_kind) :: iw, lo, up, j, i + real(dbl_kind), dimension(nx_glob, ny_glob) :: G_uvel, G_vvel, & + G_strintx, G_strinty, G_stressp_1, G_stressp_2, G_stressp_3, & + G_stressp_4, G_stressm_1, G_stressm_2, G_stressm_3, & + G_stressm_4, G_stress12_1, G_stress12_2, G_stress12_3, & + G_stress12_4, G_divu, G_rdg_conv, G_rdg_shear, G_shear, & + G_taubx, G_tauby + + character(len=*), parameter :: & + subname = '(ice_dyn_evp_1d_copyout)' + + ! remap 1D arrays into 2D arrays + if (my_task == master_task) then + + G_uvel = c0 + G_vvel = c0 + G_strintx = c0 + G_strinty = c0 + G_stressp_1 = c0 + G_stressp_2 = c0 + G_stressp_3 = c0 + G_stressp_4 = c0 + G_stressm_1 = c0 + G_stressm_2 = c0 + G_stressm_3 = c0 + G_stressm_4 = c0 + G_stress12_1 = c0 + G_stress12_2 = c0 + G_stress12_3 = c0 + G_stress12_4 = c0 + G_divu = c0 + G_rdg_conv = c0 + G_rdg_shear = c0 + G_shear = c0 + G_taubx = c0 + G_tauby = c0 + + !$OMP PARALLEL PRIVATE(iw, lo, up, j, i) + call domp_get_domain(1, NA_len, lo, up) + do iw = lo, up + ! get 2D indices + i = indi(iw) + j = indj(iw) + ! 
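+ ! (Here the 2D indices come from the stored indi/indj; the halo
+ ! loop further down instead inverts the flattened index, with
+ ! j = (indij-1)/nx_glob + 1 (integer division) and
+ ! i = indij - (j-1)*nx_glob, i.e. the inverse of the
+ ! i + (j-1)*nx mapping used in convert_2d_1d.)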
remap
+ G_strintx(i, j) = strintx(iw)
+ G_strinty(i, j) = strinty(iw)
+ G_stressp_1(i, j) = stressp_1(iw)
+ G_stressp_2(i, j) = stressp_2(iw)
+ G_stressp_3(i, j) = stressp_3(iw)
+ G_stressp_4(i, j) = stressp_4(iw)
+ G_stressm_1(i, j) = stressm_1(iw)
+ G_stressm_2(i, j) = stressm_2(iw)
+ G_stressm_3(i, j) = stressm_3(iw)
+ G_stressm_4(i, j) = stressm_4(iw)
+ G_stress12_1(i, j) = stress12_1(iw)
+ G_stress12_2(i, j) = stress12_2(iw)
+ G_stress12_3(i, j) = stress12_3(iw)
+ G_stress12_4(i, j) = stress12_4(iw)
+ G_divu(i, j) = divu(iw)
+ G_rdg_conv(i, j) = rdg_conv(iw)
+ G_rdg_shear(i, j) = rdg_shear(iw)
+ G_shear(i, j) = shear(iw)
+ G_taubx(i, j) = taubx(iw)
+ G_tauby(i, j) = tauby(iw)
+ G_uvel(i, j) = uvel(iw)
+ G_vvel(i, j) = vvel(iw)
+ end do
+ call domp_get_domain(NA_len + 1, NAVEL_len, lo, up)
+ do iw = lo, up
+ ! get 2D indices
+ j = int((indij(iw) - 1) / (nx_glob)) + 1
+ i = indij(iw) - (j - 1) * nx_glob
+ ! remap
+ G_uvel(i, j) = uvel(iw)
+ G_vvel(i, j) = vvel(iw)
+ end do
+ !$OMP END PARALLEL
+
+ call dealloc1d()
+
+ end if
+
+ ! scatter data on all tasks
+ call scatter_global_ext(I_uvel, G_uvel, master_task, distrb_info)
+ call scatter_global_ext(I_vvel, G_vvel, master_task, distrb_info)
+ call scatter_global_ext(I_strintx, G_strintx, master_task, distrb_info)
+ call scatter_global_ext(I_strinty, G_strinty, master_task, distrb_info)
+ call scatter_global_ext(I_stressp_1, G_stressp_1, master_task, distrb_info)
+ call scatter_global_ext(I_stressp_2, G_stressp_2, master_task, distrb_info)
+ call scatter_global_ext(I_stressp_3, G_stressp_3, master_task, distrb_info)
+ call scatter_global_ext(I_stressp_4, G_stressp_4, master_task, distrb_info)
+ call scatter_global_ext(I_stressm_1, G_stressm_1, master_task, distrb_info)
+ call scatter_global_ext(I_stressm_2, G_stressm_2, master_task, distrb_info)
+ call scatter_global_ext(I_stressm_3, G_stressm_3, master_task, distrb_info)
+ call scatter_global_ext(I_stressm_4, G_stressm_4, master_task, distrb_info)
+ call scatter_global_ext(I_stress12_1, G_stress12_1, master_task, distrb_info)
+ call scatter_global_ext(I_stress12_2, G_stress12_2, master_task, distrb_info)
+ call scatter_global_ext(I_stress12_3, G_stress12_3, master_task, distrb_info)
+ call scatter_global_ext(I_stress12_4, G_stress12_4, master_task, distrb_info)
+ call scatter_global_ext(I_divu, G_divu, master_task, distrb_info)
+ call scatter_global_ext(I_rdg_conv, G_rdg_conv, master_task, distrb_info)
+ call scatter_global_ext(I_rdg_shear, G_rdg_shear, master_task, distrb_info)
+ call scatter_global_ext(I_shear, G_shear, master_task, distrb_info)
+ call scatter_global_ext(I_taubx, G_taubx, master_task, distrb_info)
+ call scatter_global_ext(I_tauby, G_tauby, master_task, distrb_info)
+
+ end subroutine ice_dyn_evp_1d_copyout
+
+!=======================================================================
+
+ subroutine ice_dyn_evp_1d_kernel
+
+ use ice_constants, only : c0
+ use ice_dyn_shared, only : ndte
+ use ice_communicate, only : my_task, master_task
+
+ implicit none
+
+ ! local variables
+
+ real(kind=dbl_kind) :: rhow
+ integer(kind=int_kind) :: ksub
+
+ character(len=*), parameter :: &
+ subname = '(ice_dyn_evp_1d_kernel)'
+
+ ! all calculations are done on the master task
+ if (my_task == master_task) then
+
+ ! 
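+ ! The subcycling below runs inside one OMP PARALLEL region:
+ ! domp_get_domain, called inside evp1d_stress, evp1d_stepu and
+ ! evp1d_halo_update, hands each thread a fixed slice of the 1D
+ ! vectors, so the OMP BARRIERs between the stress, momentum and
+ ! halo phases are the only per-substep synchronization.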
read constants + call icepack_query_parameters(rhow_out = rhow) + call icepack_warnings_flush(nu_diag) + if (icepack_warnings_aborted()) then + call abort_ice(error_message=subname, file=__FILE__, & + line=__LINE__) + end if + + if (ndte < 2) call abort_ice(subname & + // ' ERROR: ndte must be 2 or higher for this kernel') + + !$OMP PARALLEL PRIVATE(ksub) + do ksub = 1, ndte - 1 + call evp1d_stress(NA_len, ee, ne, se, 1, NA_len, uvel, & + vvel, dxt, dyt, hte, htn, htem1, htnm1, strength, & + stressp_1, stressp_2, stressp_3, stressp_4, stressm_1, & + stressm_2, stressm_3, stressm_4, stress12_1, & + stress12_2, stress12_3, stress12_4, str1, str2, str3, & + str4, str5, str6, str7, str8, skiptcell) + !$OMP BARRIER + call evp1d_stepu(NA_len, rhow, 1, NA_len, cdn_ocn, aiu, & + uocn, vocn, forcex, forcey, umassdti, fm, uarear, Tbu, & + uvel_init, vvel_init, uvel, vvel, str1, str2, str3, & + str4, str5, str6, str7, str8, nw, sw, sse, skipucell) + !$OMP BARRIER + call evp1d_halo_update(NAVEL_len, 1, NA_len, uvel, vvel, & + halo_parent) + !$OMP BARRIER + end do + + call evp1d_stress(NA_len, ee, ne, se, 1, NA_len, uvel, vvel, & + dxt, dyt, hte, htn, htem1, htnm1, strength, stressp_1, & + stressp_2, stressp_3, stressp_4, stressm_1, stressm_2, & + stressm_3, stressm_4, stress12_1, stress12_2, stress12_3, & + stress12_4, str1, str2, str3, str4, str5, str6, str7, & + str8, skiptcell, tarear, divu, rdg_conv, rdg_shear, shear) + !$OMP BARRIER + call evp1d_stepu(NA_len, rhow, 1, NA_len, cdn_ocn, aiu, uocn, & + vocn, forcex, forcey, umassdti, fm, uarear, Tbu, & + uvel_init, vvel_init, uvel, vvel, str1, str2, str3, str4, & + str5, str6, str7, str8, nw, sw, sse, skipucell, strintx, & + strinty, taubx, tauby) + !$OMP BARRIER + call evp1d_halo_update(NAVEL_len, 1, NA_len, uvel, vvel, & + halo_parent) + !$OMP END PARALLEL + + end if ! master task + + end subroutine ice_dyn_evp_1d_kernel + +!======================================================================= + + subroutine calc_na(nx, ny, na, icetmask, iceumask) + ! Calculate number of active points + + use ice_blocks, only : nghost + + implicit none + + integer(kind=int_kind), intent(in) :: nx, ny + integer(kind=int_kind), dimension(nx, ny), intent(in) :: & + icetmask + logical(kind=log_kind), dimension(nx, ny), intent(in) :: & + iceumask + integer(kind=int_kind), intent(out) :: na + + ! local variables + + integer(kind=int_kind) :: i, j + + character(len=*), parameter :: subname = '(calc_na)' + + na = 0 + ! NOTE: T mask includes northern and eastern ghost cells + do j = 1 + nghost, ny + do i = 1 + nghost, nx + if (icetmask(i,j) == 1 .or. iceumask(i,j)) na = na + 1 + end do + end do + + end subroutine calc_na + +!======================================================================= + + subroutine calc_2d_indices(nx, ny, na, icetmask, iceumask) + + use ice_blocks, only : nghost + + implicit none + + integer(kind=int_kind), intent(in) :: nx, ny, na + integer(kind=int_kind), dimension(nx, ny), intent(in) :: & + icetmask + logical(kind=log_kind), dimension(nx, ny), intent(in) :: & + iceumask + + ! local variables + + integer(kind=int_kind) :: i, j, Nmaskt + + character(len=*), parameter :: subname = '(calc_2d_indices)' + + skipucell(:) = .false. + skiptcell(:) = .false. + indi = 0 + indj = 0 + Nmaskt = 0 + ! NOTE: T mask includes northern and eastern ghost cells + do j = 1 + nghost, ny + do i = 1 + nghost, nx + if (icetmask(i,j) == 1 .or. 
iceumask(i,j)) then + Nmaskt = Nmaskt + 1 + indi(Nmaskt) = i + indj(Nmaskt) = j + if (icetmask(i,j) /= 1) skiptcell(Nmaskt) = .true. + if (.not. iceumask(i,j)) skipucell(Nmaskt) = .true. + ! NOTE: U mask does not include northern and eastern + ! ghost cells. Skip northern and eastern ghost cells + if (i == nx) skipucell(Nmaskt) = .true. + if (j == ny) skipucell(Nmaskt) = .true. + end if + end do + end do + + end subroutine calc_2d_indices + +!======================================================================= + + subroutine calc_navel(nx_block, ny_block, na, navel) + ! Calculate number of active points, including halo points + + implicit none + + integer(kind=int_kind), intent(in) :: nx_block, ny_block, na + integer(kind=int_kind), intent(out) :: navel + + ! local variables + + integer(kind=int_kind) :: iw, i, j + integer(kind=int_kind), dimension(1:na) :: Iin, Iee, Ine, Ise, & + Inw, Isw, Isse + integer(kind=int_kind), dimension(1:7 * na) :: util1, util2 + + character(len=*), parameter :: subname = '(calc_navel)' + + ! calculate additional 1D indices used for finite differences + do iw = 1, na + ! get 2D indices + i = indi(iw) + j = indj(iw) + ! calculate 1D indices + Iin(iw) = i + (j - 1) * nx_block ! ( 0, 0) target point + Iee(iw) = i - 1 + (j - 1) * nx_block ! (-1, 0) + Ine(iw) = i - 1 + (j - 2) * nx_block ! (-1, -1) + Ise(iw) = i + (j - 2) * nx_block ! ( 0, -1) + Inw(iw) = i + 1 + (j - 1) * nx_block ! (+1, 0) + Isw(iw) = i + 1 + (j - 0) * nx_block ! (+1, +1) + Isse(iw) = i + (j - 0) * nx_block ! ( 0, +1) + end do + + ! find number of points needed for finite difference calculations + call union(Iin, Iee, na, na, util1, i ) + call union(util1, Ine, i, na, util2, j ) + call union(util2, Ise, j, na, util1, i ) + call union(util1, Inw, i, na, util2, j ) + call union(util2, Isw, j, na, util1, i ) + call union(util1, Isse, i, na, util2, navel) + + end subroutine calc_navel + +!======================================================================= + + subroutine convert_2d_1d(nx, ny, na, navel, I_HTE, I_HTN, & + I_cdn_ocn, I_aiu, I_uocn, I_vocn, I_forcex, I_forcey, I_Tbu, & + I_umassdti, I_fm, I_uarear, I_tarear, I_strintx, I_strinty, & + I_uvel_init, I_vvel_init, I_strength, I_uvel, I_vvel, I_dxt, & + I_dyt, I_stressp_1, I_stressp_2, I_stressp_3, I_stressp_4, & + I_stressm_1, I_stressm_2, I_stressm_3, I_stressm_4, & + I_stress12_1, I_stress12_2, I_stress12_3, I_stress12_4) + + implicit none + + integer(kind=int_kind), intent(in) :: nx, ny, na, navel + real (kind=dbl_kind), dimension(nx, ny), intent(in) :: I_HTE, & + I_HTN, I_cdn_ocn, I_aiu, I_uocn, I_vocn, I_forcex, I_forcey, & + I_Tbu, I_umassdti, I_fm, I_uarear, I_tarear, I_strintx, & + I_strinty, I_uvel_init, I_vvel_init, I_strength, I_uvel, & + I_vvel, I_dxt, I_dyt, I_stressp_1, I_stressp_2, I_stressp_3, & + I_stressp_4, I_stressm_1, I_stressm_2, I_stressm_3, & + I_stressm_4, I_stress12_1, I_stress12_2, I_stress12_3, & + I_stress12_4 + + ! local variables + + integer(kind=int_kind) :: iw, lo, up, j, i, nachk + integer(kind=int_kind), dimension(1:na) :: Iin, Iee, Ine, Ise, & + Inw, Isw, Isse + integer(kind=int_kind), dimension(1:7 * na) :: util1, util2 + + character(len=*), parameter :: subname = '(convert_2d_1d)' + + ! calculate additional 1D indices used for finite differences + do iw = 1, na + ! get 2D indices + i = indi(iw) + j = indj(iw) + ! calculate 1D indices + Iin(iw) = i + (j - 1) * nx ! ( 0, 0) target point + Iee(iw) = i - 1 + (j - 1) * nx ! (-1, 0) + Ine(iw) = i - 1 + (j - 2) * nx ! (-1,-1) + Ise(iw) = i + (j - 2) * nx ! 
( 0,-1) + Inw(iw) = i + 1 + (j - 1) * nx ! (+1, 0) + Isw(iw) = i + 1 + (j - 0) * nx ! (+1,+1) + Isse(iw) = i + (j - 0) * nx ! ( 0,+1) + end do + + ! find number of points needed for finite difference calculations + call union(Iin, Iee, na, na, util1, i ) + call union(util1, Ine, i, na, util2, j ) + call union(util2, Ise, j, na, util1, i ) + call union(util1, Inw, i, na, util2, j ) + call union(util2, Isw, j, na, util1, i ) + call union(util1, Isse, i, na, util2, nachk) + + ! index vector with sorted target points + do iw = 1, na + indij(iw) = Iin(iw) + end do + + ! sorted additional points + call setdiff(util2, Iin, navel, na, util1, j) + do iw = na + 1, navel + indij(iw) = util1(iw - na) + end do + + ! indices for additional points needed for uvel and vvel + call findXinY(Iee, indij, na, navel, ee) + call findXinY(Ine, indij, na, navel, ne) + call findXinY(Ise, indij, na, navel, se) + call findXinY(Inw, indij, na, navel, nw) + call findXinY(Isw, indij, na, navel, sw) + call findXinY(Isse, indij, na, navel, sse) + + !$OMP PARALLEL PRIVATE(iw, lo, up, j, i) + ! write 1D arrays from 2D arrays (target points) + call domp_get_domain(1, na, lo, up) + do iw = lo, up + ! get 2D indices + i = indi(iw) + j = indj(iw) + ! map + uvel(iw) = I_uvel(i, j) + vvel(iw) = I_vvel(i, j) + cdn_ocn(iw) = I_cdn_ocn(i, j) + aiu(iw) = I_aiu(i, j) + uocn(iw) = I_uocn(i, j) + vocn(iw) = I_vocn(i, j) + forcex(iw) = I_forcex(i, j) + forcey(iw) = I_forcey(i, j) + Tbu(iw) = I_Tbu(i, j) + umassdti(iw) = I_umassdti(i, j) + fm(iw) = I_fm(i, j) + tarear(iw) = I_tarear(i, j) + uarear(iw) = I_uarear(i, j) + strintx(iw) = I_strintx(i, j) + strinty(iw) = I_strinty(i, j) + uvel_init(iw) = I_uvel_init(i, j) + vvel_init(iw) = I_vvel_init(i, j) + strength(iw) = I_strength(i, j) + dxt(iw) = I_dxt(i, j) + dyt(iw) = I_dyt(i, j) + stressp_1(iw) = I_stressp_1(i, j) + stressp_2(iw) = I_stressp_2(i, j) + stressp_3(iw) = I_stressp_3(i, j) + stressp_4(iw) = I_stressp_4(i, j) + stressm_1(iw) = I_stressm_1(i, j) + stressm_2(iw) = I_stressm_2(i, j) + stressm_3(iw) = I_stressm_3(i, j) + stressm_4(iw) = I_stressm_4(i, j) + stress12_1(iw) = I_stress12_1(i, j) + stress12_2(iw) = I_stress12_2(i, j) + stress12_3(iw) = I_stress12_3(i, j) + stress12_4(iw) = I_stress12_4(i, j) + HTE(iw) = I_HTE(i, j) + HTN(iw) = I_HTN(i, j) + HTEm1(iw) = I_HTE(i - 1, j) + HTNm1(iw) = I_HTN(i, j - 1) + end do + ! write 1D arrays from 2D arrays (additional points) + call domp_get_domain(na + 1, navel, lo, up) + do iw = lo, up + ! get 2D indices + j = int((indij(iw) - 1) / (nx)) + 1 + i = indij(iw) - (j - 1) * nx + ! map + uvel(iw) = I_uvel(i, j) + vvel(iw) = I_vvel(i, j) + end do !$OMP END PARALLEL - endif - - end subroutine evp_kernel_v2 - -!---------------------------------------------------------------------------- - - subroutine calc_na(nx,ny,na,icetmask) - ! Calculate number of active points (na) - use ice_blocks, only: nghost - - implicit none - - integer(int_kind),intent(in) :: nx,ny - integer(int_kind),intent(out) :: na - integer (kind=int_kind),dimension (nx,ny), intent(in) :: icetmask - integer(int_kind) :: i,j - - character(len=*), parameter :: subname = '(calc_na)' - !--------------------------------------- - - na = 0 -! Note: The icellt mask includes north and east ghost cells. (ice_dyn_shared.F90) - do j = 1+nghost, ny ! -nghost - do i = 1+nghost, nx ! 
-nghost - if (icetmask(i,j)==1) then - na=na+1 - endif - enddo - enddo - - end subroutine calc_na - -!---------------------------------------------------------------------------- - - subroutine calc_2d_indices(nx,ny,na,icetmask,iceumask) - - use ice_blocks, only: nghost - - implicit none - - integer(int_kind),intent(in) :: nx,ny,na - integer (kind=int_kind),dimension (nx,ny), intent(in) :: icetmask - logical (kind=log_kind),dimension (nx,ny), intent(in) :: iceumask - integer(int_kind) :: i,j,Nmaskt - - character(len=*), parameter :: subname = '(calc_2d_indices)' - !--------------------------------------- - - skipucell(:)=.false. - indi=0 - indj=0 - Nmaskt=0 -! Note: The icellt mask includes north and east ghost cells. (ice_dyn_shared.F90) - do j = 1+nghost, ny ! -nghost - do i = 1+nghost, nx ! -nghost - if (icetmask(i,j)==1) then - Nmaskt=Nmaskt+1 - indi(Nmaskt) = i - indj(Nmaskt) = j - ! Umask do NOT include north/east ghost cells ... skip these as well - if (iceumask(i,j) .eqv. .false. ) skipucell(Nmaskt) = .true. - if (i==nx) skipucell(Nmaskt) = .true. - if (j==ny) skipucell(Nmaskt) = .true. - endif - enddo - enddo - if (Nmaskt.ne.na) then - write(nu_diag,*) subname,' Nmaskt,na: ',Nmaskt,na - call abort_ice(subname//': ERROR Problem Nmaskt != na') - endif - if (Nmaskt==0) then - write(nu_diag,*) subname,' WARNING: NO ICE' - endif - - end subroutine calc_2d_indices - -!---------------------------------------------------------------------------- - - subroutine calc_navel(nx_block,ny_block,na,navel) - ! Calculate number of active points including needed halo points (navel) - - implicit none - - integer(int_kind),intent(in) :: nx_block,ny_block,na - integer(int_kind),intent(out) :: navel - - integer(int_kind) :: iw,i,j - integer(int_kind),dimension(1:na) :: Iin,Iee,Ine,Ise,Inw,Isw,Isse - integer(int_kind),dimension(1:7*na) :: util1,util2 - - character(len=*), parameter :: subname = '(calc_navel)' - - !--------------------------------------- - ! Additional indices used for finite differences (FD) - do iw=1,na - i=indi(iw) - j=indj(iw) - Iin(iw) = i + (j-1)*nx_block ! ( 0, 0) Target point - Iee(iw) = i-1 + (j-1)*nx_block ! (-1, 0) - Ine(iw) = i-1 + (j-2)*nx_block ! (-1,-1) - Ise(iw) = i + (j-2)*nx_block ! ( 0,-1) - Inw(iw) = i+1 + (j-1)*nx_block ! (+1, 0) - Isw(iw) = i+1 + (j-0)*nx_block ! (+1,+1) - Isse(iw)= i + (j-0)*nx_block ! ( 0,+1) - enddo - - !-- Find number of points needed for finite difference calculations - call union(Iin, Iee,na,na,util1,i) - call union(util1,Ine, i,na,util2,j) - call union(util2,Ise, j,na,util1,i) - call union(util1,Inw, i,na,util2,j) - call union(util2,Isw, j,na,util1,i) - call union(util1,Isse,i,na,util2,navel) - - !-- Check bounds - do iw=1,navel - if (util2(iw)>nx_block*ny_block .or. util2(iw)<1) then - write(nu_diag,*) subname,' nx_block,ny_block,nx_block*ny_block: ',nx_block,ny_block,nx_block*ny_block - write(nu_diag,*) subname,' na,navel,iw,util2(iw): ',na,navel,iw,util2(iw) - call abort_ice(subname//': Problem with boundary. 
Check halo zone values') - endif - enddo - - end subroutine calc_navel - -!---------------------------------------------------------------------------- - - subroutine convert_2d_1d_v2(nx,ny, na,navel, & - I_HTE,I_HTN, & -!v1 I_dxhy,I_dyhx,I_cyp,I_cxp,I_cym,I_cxm,I_tinyarea, & -!v1 I_waterx,I_watery, & - I_cdn_ocn,I_aiu,I_uocn,I_vocn,I_forcex,I_forcey,I_Tbu, & - I_umassdti,I_fm,I_uarear,I_tarear,I_strintx,I_strinty, & - I_uvel_init,I_vvel_init, & - I_strength,I_uvel,I_vvel,I_dxt,I_dyt, & - I_stressp_1 ,I_stressp_2, I_stressp_3, I_stressp_4, & - I_stressm_1 ,I_stressm_2, I_stressm_3, I_stressm_4, & - I_stress12_1,I_stress12_2,I_stress12_3,I_stress12_4 ) - - implicit none - - integer(int_kind),intent(in) :: nx,ny,na,navel - real (kind=dbl_kind), dimension(nx,ny), intent(in) :: & - I_HTE,I_HTN, & -!v1 I_dxhy,I_dyhx,I_cyp,I_cxp,I_cym,I_cxm,I_tinyarea, & -!v1 I_waterx,I_watery, & - I_cdn_ocn,I_aiu,I_uocn,I_vocn,I_forcex,I_forcey,I_Tbu, & - I_umassdti,I_fm,I_uarear,I_tarear,I_strintx,I_strinty, & - I_uvel_init,I_vvel_init, & - I_strength,I_uvel,I_vvel,I_dxt,I_dyt, & - I_stressp_1 ,I_stressp_2, I_stressp_3, I_stressp_4, & - I_stressm_1 ,I_stressm_2, I_stressm_3, I_stressm_4, & - I_stress12_1,I_stress12_2,I_stress12_3,I_stress12_4 - - integer(int_kind) :: iw,i,j, nx_block - integer(int_kind),dimension(1:na) :: Iin,Iee,Ine,Ise,Inw,Isw,Isse - integer(int_kind),dimension(1:7*na) :: util1,util2 - integer(int_kind) :: nachk - - character(len=*), parameter :: subname = '(convert_2d_1d_v2)' - - !--------------------------------------- - ! Additional indices used for finite differences (FD) - nx_block=nx ! Total block size in x-dir - do iw=1,na - i=indi(iw) - j=indj(iw) - Iin(iw) = i + (j-1)*nx_block ! ( 0, 0) Target point - Iee(iw) = i-1 + (j-1)*nx_block ! (-1, 0) - Ine(iw) = i-1 + (j-2)*nx_block ! (-1,-1) - Ise(iw) = i + (j-2)*nx_block ! ( 0,-1) - Inw(iw) = i+1 + (j-1)*nx_block ! (+1, 0) - Isw(iw) = i+1 + (j-0)*nx_block ! (+1,+1) - Isse(iw)= i + (j-0)*nx_block ! ( 0,+1) - enddo - - !-- Find number of points needed for finite difference calculations - call union(Iin, Iee,na,na,util1,i) - call union(util1,Ine, i,na,util2,j) - call union(util2,Ise, j,na,util1,i) - call union(util1,Inw, i,na,util2,j) - call union(util2,Isw, j,na,util1,i) - call union(util1,Isse,i,na,util2,nachk) - - if (nachk .ne. navel) then - write(nu_diag,*) subname,' ERROR: navel badly chosen: na,navel,nachk = ',na,navel,nachk - call abort_ice(subname//': ERROR: navel badly chosen') - endif - - ! indij: vector with target points (sorted) ... - do iw=1,na - indij(iw)=Iin(iw) - enddo - - ! indij: ... followed by extra points (sorted) - call setdiff(util2,Iin,navel,na,util1,j) - do iw=na+1,navel - indij(iw)=util1(iw-na) - enddo - - !-- Create indices for additional points needed for uvel,vvel: - call findXinY(Iee ,indij,na,navel, ee) - call findXinY(Ine ,indij,na,navel, ne) - call findXinY(Ise ,indij,na,navel, se) - call findXinY(Inw ,indij,na,navel, nw) - call findXinY(Isw ,indij,na,navel, sw) - call findXinY(Isse,indij,na,navel,sse) - - !-- write check -!if (1 == 2) then -! write(nu_diag,*) subname,' MHRI: INDICES start:' -! write(nu_diag,*) 'Min/max ee', minval(ee), maxval(ee) -! write(nu_diag,*) 'Min/max ne', minval(ne), maxval(ne) -! write(nu_diag,*) 'Min/max se', minval(se), maxval(se) -! write(nu_diag,*) 'Min/max nw', minval(nw), maxval(nw) -! write(nu_diag,*) 'Min/max sw', minval(sw), maxval(sw) -! write(nu_diag,*) 'Min/max sse',minval(sse),maxval(sse) -! write(nu_diag,*) subname,' MHRI: INDICES end:' -!endif - - ! 
Write 1D data from 2D: Here only extra FD part, the rest follows... - !$OMP PARALLEL DO PRIVATE(iw,i,j) - do iw=na+1,navel - j=int((indij(iw)-1)/(nx_block))+1 - i=indij(iw)-(j-1)*nx_block - uvel(iw)= I_uvel(i,j) - vvel(iw)= I_vvel(i,j) - enddo - !$OMP END PARALLEL DO - - ! Write 1D data from 2D - !$OMP PARALLEL DO PRIVATE(iw,i,j) - do iw=1,na - i=indi(iw) - j=indj(iw) - uvel(iw)= I_uvel(i,j) - vvel(iw)= I_vvel(i,j) - cdn_ocn(iw)= I_cdn_ocn(i,j) - aiu(iw)= I_aiu(i,j) - uocn(iw)= I_uocn(i,j) - vocn(iw)= I_vocn(i,j) - forcex(iw)= I_forcex(i,j) - forcey(iw)= I_forcey(i,j) - Tbu(iw)= I_Tbu(i,j) - umassdti(iw)= I_umassdti(i,j) - fm(iw)= I_fm(i,j) - tarear(iw)= I_tarear(i,j) - uarear(iw)= I_uarear(i,j) - strintx(iw)= I_strintx(i,j) - strinty(iw)= I_strinty(i,j) - uvel_init(iw)= I_uvel_init(i,j) - vvel_init(iw)= I_vvel_init(i,j) - strength(iw)= I_strength(i,j) - dxt(iw)= I_dxt(i,j) - dyt(iw)= I_dyt(i,j) - stressp_1(iw)= I_stressp_1(i,j) - stressp_2(iw)= I_stressp_2(i,j) - stressp_3(iw)= I_stressp_3(i,j) - stressp_4(iw)= I_stressp_4(i,j) - stressm_1(iw)= I_stressm_1(i,j) - stressm_2(iw)= I_stressm_2(i,j) - stressm_3(iw)= I_stressm_3(i,j) - stressm_4(iw)= I_stressm_4(i,j) - stress12_1(iw)=I_stress12_1(i,j) - stress12_2(iw)=I_stress12_2(i,j) - stress12_3(iw)=I_stress12_3(i,j) - stress12_4(iw)=I_stress12_4(i,j) -!v1 dxhy(iw)= I_dxhy(i,j) -!v1 dyhx(iw)= I_dyhx(i,j) -!v1 cyp(iw)= I_cyp(i,j) -!v1 cxp(iw)= I_cxp(i,j) -!v1 cym(iw)= I_cym(i,j) -!v1 cxm(iw)= I_cxm(i,j) -!v1 tinyarea(iw)= I_tinyarea(i,j) -!v1 waterx(iw)= I_waterx(i,j) -!v1 watery(iw)= I_watery(i,j) - HTE(iw) = I_HTE(i,j) - HTN(iw) = I_HTN(i,j) - HTEm1(iw) = I_HTE(i-1,j) - HTNm1(iw) = I_HTN(i,j-1) - enddo - !$OMP END PARALLEL DO - - end subroutine convert_2d_1d_v2 - -!---------------------------------------------------------------------------- - - subroutine calc_halo_parent(nx,ny,na,navel, I_icetmask) - - implicit none - - integer(kind=int_kind),intent(in) :: nx,ny,na,navel - integer(kind=int_kind), dimension(nx,ny), intent(in) :: I_icetmask - - integer(kind=int_kind) :: iw,i,j !,masku,maskt - integer(kind=int_kind),dimension(1:navel) :: Ihalo - - character(len=*), parameter :: subname = '(calc_halo_parent)' - - !--------------------------------------- - ! Indices for halo update: - ! 0: no halo point - ! >0: index for halo point parent. Finally related to indij vector - ! TODO: ONLY for nghost==1 - ! TODO: ONLY for circular grids - NOT tripole grids - - Ihalo(:)=0 - halo_parent(:)=0 - - !$OMP PARALLEL DO PRIVATE(iw,i,j) - do iw=1,navel - j=int((indij(iw)-1)/(nx))+1 - i=indij(iw)-(j-1)*nx - ! If within ghost-zone: - if (i==nx .and. I_icetmask( 2,j)==1) Ihalo(iw)= 2+ (j-1)*nx - if (i==1 .and. I_icetmask(nx-1,j)==1) Ihalo(iw)=(nx-1)+ (j-1)*nx - if (j==ny .and. I_icetmask(i, 2)==1) Ihalo(iw)= i+ nx - if (j==1 .and. I_icetmask(i,ny-1)==1) Ihalo(iw)= i+(ny-2)*nx - enddo - !$OMP END PARALLEL DO - - ! Relate halo indices to indij vector - call findXinY_halo(Ihalo,indij,navel,navel,halo_parent) - - !-- write check -!if (1 == 1) then -! integer(kind=int_kind) :: iiw,ii,jj !,masku,maskt MHRI -! write(nu_diag,*) subname,' MHRI: halo boundary start:' -! do iw=1,navel -! if (halo_parent(iw)>0) then -! iiw=halo_parent(iw) -! j=int((indij(iiw)-1)/(nx))+1 -! i=indij(iiw)-(j-1)*nx -! ii=i -! jj=j -! j=int((indij(iw)-1)/(nx))+1 -! i=indij(iw)-(j-1)*nx -! write(nu_diag,*)iw,i,j,iiw,ii,jj -! endif -! enddo -! 
write(nu_diag,*) subname,' MHRI: halo boundary end:'
-!endif
-
-  end subroutine calc_halo_parent
-
-!----------------------------------------------------------------------------
-
-  subroutine union(x,y,nx,ny,xy,nxy)
-    ! Find union (xy) of two sorted integer vectors (x and y)
-    ! ie. Combined values of the two vectors with no repetitions.
-    !use ice_kinds_mod
-
-    implicit none
-
-    integer (int_kind) :: i,j,k
-    integer (int_kind),intent(in) :: nx,ny
-    integer (int_kind),intent(in) :: x(1:nx),y(1:ny)
-    integer (int_kind),intent(out) :: xy(1:nx+ny)
-    integer (int_kind),intent(out) :: nxy
-
-    character(len=*), parameter :: subname = '(union)'
-
-    !---------------------------------------
-
-    i=1
-    j=1
-    k=1
-    do while (i<=nx .and. j<=ny)
-       if (x(i)<y(j)) then
-          xy(k)=x(i)
-          i=i+1
-       else if (x(i)>y(j)) then
-          xy(k)=y(j)
-          j=j+1
-       else !if (x(i)==y(j)) then
-          xy(k)=x(i)
-          i=i+1
-          j=j+1
-       endif
-       k=k+1
-    enddo
-
-    ! The rest
-    do while (i<=nx)
-       xy(k)=x(i)
-       i=i+1
-       k=k+1
-    enddo
-    do while (j<=ny)
-       xy(k)=y(j)
-       j=j+1
-       k=k+1
-    enddo
-    nxy=k-1
-
-  end subroutine union
-
-!----------------------------------------------------------------------------
-
-  subroutine setdiff(x,y,nx,ny,xy,nxy)
-    ! Find element (xy) of two sorted integer vectors (x and y)
-    ! that are in x, but not in y ... or in y, but not in x
-    !use ice_kinds_mod
-
-    implicit none
-
-    integer (int_kind) :: i,j,k
-    integer (int_kind),intent(in) :: nx,ny
-    integer (int_kind),intent(in) :: x(1:nx),y(1:ny)
-    integer (int_kind),intent(out) :: xy(1:nx+ny)
-    integer (int_kind),intent(out) :: nxy
-
-    character(len=*), parameter :: subname = '(setdiff)'
-    !---------------------------------------
-
-    i=1
-    j=1
-    k=1
-    do while (i<=nx .and. j<=ny)
-       if (x(i)<y(j)) then
-          xy(k)=x(i)
-          i=i+1
-          k=k+1
-       else if (x(i)>y(j)) then
-          xy(k)=y(j)
-          j=j+1
-          k=k+1
-       else !if (x(i)==y(j)) then
-          i=i+1
-          j=j+1
-       endif
-    enddo
-
-    ! The rest
-    do while (i<=nx)
-       xy(k)=x(i)
-       i=i+1
-       k=k+1
-    enddo
-    do while (j<=ny)
-       xy(k)=y(j)
-       j=j+1
-       k=k+1
-    enddo
-    nxy=k-1
-
-  end subroutine setdiff
-
-!----------------------------------------------------------------------------
-
-  subroutine findXinY(x,y,nx,ny,indx)
-    ! Find indx vector so that x(1:na)=y(indx(1:na))
-    !
-    ! Conditions:
-    !  * EVERY item in x is found in y.
-    !  * x(1:nx) is a sorted integer vector.
-    !  * y(1:ny) consists of two sorted integer vectors:
-    !    [y(1:nx) ; y(nx+1:ny)]
-    !  * ny>=nx
-    ! Return: indx(1:na)
-    !
-    !use ice_kinds_mod
-
-    implicit none
-
-    integer (int_kind),intent(in) :: nx,ny
-    integer (int_kind),intent(in) :: x(1:nx),y(1:ny)
-    integer (int_kind),intent(out) :: indx(1:nx)
-    integer (int_kind) :: i,j1,j2
-
-    character(len=*), parameter :: subname = '(findXinY)'
-    !---------------------------------------
-
-    i=1
-    j1=1
-    j2=nx+1
-    do while (i<=nx)
-       if (x(i)==y(j1)) then
-          indx(i)=j1
-          i=i+1
-          j1=j1+1
-       else if (x(i)==y(j2)) then
-          indx(i)=j2
-          i=i+1
-          j2=j2+1
-       else if (x(i)>y(j1) ) then !.and. j1<nx) then
-          j1=j1+1
-       else if (x(i)>y(j2) ) then !.and. j2<ny) then
-          j2=j2+1
-       else
-          call abort_ice(subname//': ERROR: conditions not met')
-       endif
-    enddo
-
-  end subroutine findXinY
-
-!----------------------------------------------------------------------------
-
-  subroutine findXinY_halo(x,y,nx,ny,indx)
-    ! Find indx vector so that x(1:na)=y(indx(1:na))
-    !
-    ! Conditions:
-    !  * EVERY item in x is found in y, except for x==0, where indx=0 is returned
-    !  * x(1:nx) is a non-sorted integer vector
-    !  * y(1:ny) is a sorted integer vector
-    !  * ny>=nx
-    ! Return: indx(1:na)
-    !
-    !use ice_kinds_mod
-
-    implicit none
-
-    integer (int_kind),intent(in) :: nx,ny
-    integer (int_kind),intent(in) :: x(1:nx),y(1:ny)
-    integer (int_kind),intent(out) :: indx(1:nx)
-    integer (int_kind) :: i,j1,nloop
-
-    character(len=*), parameter :: subname = '(findXinY_halo)'
-    !---------------------------------------
-
-    nloop=1
-    i=1
-    j1=int((ny+1)/2) ! initial guess in the middle
-    do while (i<=nx)
-       if (x(i)==0) then
-          indx(i)=0
-          i=i+1
-          nloop=1
-       else if (x(i)==y(j1)) then
-          indx(i)=j1
-          i=i+1
-          j1=j1+1
-          if (j1>ny) j1=int((ny+1)/2) ! initial guess in the middle
-          nloop=1
-       else if (x(i)<y(j1) ) then
-          j1=1
-       else if (x(i)>y(j1) ) then
-          j1=j1+1
-          if (j1>ny) then
-             j1=1
-             nloop=nloop+1
-             if (nloop>2) then
-                ! Stop for inf. loop. This check should not be necessary for halo
-                write(nu_diag,*) subname,' nx,ny: ',nx,ny
-                write(nu_diag,*) subname,' i,j1: ',i,j1
-                write(nu_diag,*) subname,' x(i),y(j1): ',x(i),y(j1)
-                call abort_ice(subname//': ERROR too many loops')
-             endif
-          endif
-       endif
-    end do
-
-  end subroutine findXinY_halo
-
-!----------------------------------------------------------------------------
-
-  subroutine numainit(l,u,uu)
-
-    use ice_constants, only: c0
-
-    implicit none
-
-    integer(kind=int_kind),intent(in) :: l,u,uu
-
-    integer(kind=int_kind) :: lo,up
-
-    character(len=*), parameter :: subname = '(numainit)'
-    !---------------------------------------
-
-    call domp_get_domain(l,u,lo,up)
-    ee(lo:up)=0
-    ne(lo:up)=0
-    se(lo:up)=0
-    sse(lo:up)=0
-    nw(lo:up)=0
-    sw(lo:up)=0
-    halo_parent(lo:up)=0
-    strength(lo:up)=c0
-    uvel(lo:up)=c0
-    vvel(lo:up)=c0
-    uvel_init(lo:up)=c0
-    vvel_init(lo:up)=c0
-    uocn(lo:up)=c0
-    vocn(lo:up)=c0
-    dxt(lo:up)=c0
-    dyt(lo:up)=c0
-    HTE(lo:up)=c0
-    HTN(lo:up)=c0
-    HTEm1(lo:up)=c0
-    HTNm1(lo:up)=c0
-!v1 dxhy(lo:up)=c0
-!v1 dyhx(lo:up)=c0
-!v1 cyp(lo:up)=c0
-!v1 cxp(lo:up)=c0
-!v1 cym(lo:up)=c0
-!v1 cxm(lo:up)=c0
-!v1 tinyarea(lo:up)=c0
-    stressp_1(lo:up)=c0
-    stressp_2(lo:up)=c0
-    stressp_3(lo:up)=c0
-    stressp_4(lo:up)=c0
-    stressm_1(lo:up)=c0
-    stressm_2(lo:up)=c0
-    stressm_3(lo:up)=c0
-    stressm_4(lo:up)=c0
-    stress12_1(lo:up)=c0
-    stress12_2(lo:up)=c0
-    stress12_3(lo:up)=c0
-    stress12_4(lo:up)=c0
-    tarear(lo:up)=c0
-    Tbu(lo:up)=c0
-    taubx(lo:up)=c0
-    tauby(lo:up)=c0
-    divu(lo:up)=c0
-    rdg_conv(lo:up)=c0
-    rdg_shear(lo:up)=c0
-    shear(lo:up)=c0
-    str1(lo:up)=c0
-    str2(lo:up)=c0
-    str3(lo:up)=c0
-    str4(lo:up)=c0
-    str5(lo:up)=c0
-    str6(lo:up)=c0
-    str7(lo:up)=c0
-    str8(lo:up)=c0
-    call domp_get_domain(u+1,uu,lo,up)
-    halo_parent(lo:up)=0
-    uvel(lo:up)=c0
-    vvel(lo:up)=c0
-    str1(lo:up)=c0
-    str2(lo:up)=c0
-    str3(lo:up)=c0
-    str4(lo:up)=c0
-    str5(lo:up)=c0
-    str6(lo:up)=c0
-    str7(lo:up)=c0
-    str8(lo:up)=c0
-
-  end subroutine numainit
-
-!----------------------------------------------------------------------------
-!===============================================================================
+      end subroutine convert_2d_1d
-end module ice_dyn_evp_1d
+
+!=======================================================================
+
+      subroutine calc_halo_parent(nx, ny, na, navel, I_icetmask)
+
+      implicit none
+      integer(kind=int_kind), intent(in) :: nx, ny, na, navel
+      integer(kind=int_kind), dimension(nx, ny), intent(in) :: &
+         I_icetmask
+
+      ! local variables
+
+      integer(kind=int_kind) :: iw, i, j
+      integer(kind=int_kind), dimension(1:navel) :: Ihalo
+
+      character(len=*), parameter :: subname = '(calc_halo_parent)'
+
+      !-----------------------------------------------------------------
+      ! Indices for halo update:
+      !     0: no halo point
+      !    >0: index for halo point parent, related to indij vector
+      !
+      ! TODO: Implement for nghost > 1
+      ! TODO: Implement for tripole grids
+      !-----------------------------------------------------------------
+
+      Ihalo(:) = 0
+      halo_parent(:) = 0
+
+      do iw = 1, navel
+         j = int((indij(iw) - 1) / (nx)) + 1
+         i = indij(iw) - (j - 1) * nx
+         ! if within ghost zone
+         if (i == nx .and. I_icetmask(2, j) == 1) Ihalo(iw) = 2 + (j - 1) * nx
+         if (i == 1 .and. I_icetmask(nx - 1, j) == 1) Ihalo(iw) = (nx - 1) + (j - 1) * nx
+         if (j == ny .and. I_icetmask(i, 2) == 1) Ihalo(iw) = i + nx
+         if (j == 1 .and. 
I_icetmask(i, ny - 1) == 1) Ihalo(iw) = i + (ny - 2) * nx + end do + + ! relate halo indices to indij vector + call findXinY_halo(Ihalo, indij, navel, navel, halo_parent) + + end subroutine calc_halo_parent + +!======================================================================= + + subroutine union(x, y, nx, ny, xy, nxy) + ! Find union (xy) of two sorted integer vectors (x and y), i.e. + ! combined values of the two vectors with no repetitions + + implicit none + + integer(int_kind), intent(in) :: nx, ny + integer(int_kind), intent(in) :: x(1:nx), y(1:ny) + integer(int_kind), intent(out) :: xy(1:nx + ny) + integer(int_kind), intent(out) :: nxy + + ! local variables + + integer(int_kind) :: i, j, k + + character(len=*), parameter :: subname = '(union)' + + i = 1 + j = 1 + k = 1 + do while (i <= nx .and. j <= ny) + if (x(i) < y(j)) then + xy(k) = x(i) + i = i + 1 + else if (x(i) > y(j)) then + xy(k) = y(j) + j = j + 1 + else + xy(k) = x(i) + i = i + 1 + j = j + 1 + end if + k = k + 1 + end do + + ! the rest + do while (i <= nx) + xy(k) = x(i) + i = i + 1 + k = k + 1 + end do + do while (j <= ny) + xy(k) = y(j) + j = j + 1 + k = k + 1 + end do + nxy = k - 1 + + end subroutine union + +!======================================================================= + + subroutine setdiff(x, y, nx, ny, xy, nxy) + ! Find element (xy) of two sorted integer vectors (x and y) that + ! are in x, but not in y, or in y, but not in x + + implicit none + + integer(kind=int_kind), intent(in) :: nx, ny + integer(kind=int_kind), intent(in) :: x(1:nx), y(1:ny) + integer(kind=int_kind), intent(out) :: xy(1:nx + ny) + integer(kind=int_kind), intent(out) :: nxy + + ! local variables + + integer(kind=int_kind) :: i, j, k + + character(len=*), parameter :: subname = '(setdiff)' + + i = 1 + j = 1 + k = 1 + do while (i <= nx .and. j <= ny) + if (x(i) < y(j)) then + xy(k) = x(i) + i = i + 1 + k = k + 1 + else if (x(i) > y(j)) then + xy(k) = y(j) + j = j + 1 + k = k + 1 + else + i = i + 1 + j = j + 1 + end if + end do + + ! the rest + do while (i <= nx) + xy(k) = x(i) + i = i + 1 + k = k + 1 + end do + do while (j <= ny) + xy(k) = y(j) + j = j + 1 + k = k + 1 + end do + nxy = k - 1 + + end subroutine setdiff + +!======================================================================== + + subroutine findXinY(x, y, nx, ny, indx) + ! Find indx vector so that x(1:na) = y(indx(1:na)) + ! + ! Conditions: + ! * EVERY item in x is found in y + ! * x(1:nx) is a sorted integer vector + ! * y(1:ny) consists of two sorted integer vectors: + ! [y(1:nx); y(nx + 1:ny)] + ! * ny >= nx + ! + ! Return: indx(1:na) + + implicit none + + integer (kind=int_kind), intent(in) :: nx, ny + integer (kind=int_kind), intent(in) :: x(1:nx), y(1:ny) + integer (kind=int_kind), intent(out) :: indx(1:nx) + + ! local variables + + integer (kind=int_kind) :: i, j1, j2 + + character(len=*), parameter :: subname = '(findXinY)' + + i = 1 + j1 = 1 + j2 = nx + 1 + do while (i <= nx) + if (x(i) == y(j1)) then + indx(i) = j1 + i = i + 1 + j1 = j1 + 1 + else if (x(i) == y(j2)) then + indx(i) = j2 + i = i + 1 + j2 = j2 + 1 + else if (x(i) > y(j1)) then + j1 = j1 + 1 + else if (x(i) > y(j2)) then + j2 = j2 + 1 + else + call abort_ice(subname & + // ': ERROR: conditions not met') + end if + end do + + end subroutine findXinY + +!======================================================================= + + subroutine findXinY_halo(x, y, nx, ny, indx) + ! Find indx vector so that x(1:na) = y(indx(1:na)) + ! + ! Conditions: + ! * EVERY item in x is found in y, + ! 
except for x == 0, where indx = 0 is returned + ! * x(1:nx) is a non-sorted integer vector + ! * y(1:ny) is a sorted integer vector + ! * ny >= nx + ! + ! Return: indx(1:na) + + implicit none + + integer (kind=int_kind), intent(in) :: nx, ny + integer (kind=int_kind), intent(in) :: x(1:nx), y(1:ny) + integer (kind=int_kind), intent(out) :: indx(1:nx) + + ! local variables + + integer (kind=int_kind) :: i, j1, nloop + + character(len=*), parameter :: subname = '(findXinY_halo)' + + nloop = 1 + i = 1 + j1 = int((ny + 1) / 2) ! initial guess in the middle + do while (i <= nx) + if (x(i) == 0) then + indx(i) = 0 + i = i + 1 + nloop = 1 + else if (x(i) == y(j1)) then + indx(i) = j1 + i = i + 1 + j1 = j1 + 1 + ! initial guess in the middle + if (j1 > ny) j1 = int((ny + 1) / 2) + nloop = 1 + else if (x(i) < y(j1)) then + j1 = 1 + else if (x(i) > y(j1)) then + j1 = j1 + 1 + if (j1 > ny) then + j1 = 1 + nloop = nloop + 1 + if (nloop > 2) then + ! stop for infinite loop. This check should not be + ! necessary for halo + call abort_ice(subname // ' ERROR: too many loops') + end if + end if + end if + end do + + end subroutine findXinY_halo + +!======================================================================= + + subroutine numainit(l, u, uu) + + use ice_constants, only : c0 + + implicit none + + integer(kind=int_kind), intent(in) :: l, u, uu + + ! local variables + + integer(kind=int_kind) :: lo, up + + character(len=*), parameter :: subname = '(numainit)' + + call domp_get_domain(l, u, lo, up) + ee(lo:up) = 0 + ne(lo:up) = 0 + se(lo:up) = 0 + sse(lo:up) = 0 + nw(lo:up) = 0 + sw(lo:up) = 0 + halo_parent(lo:up) = 0 + strength(lo:up) = c0 + uvel(lo:up) = c0 + vvel(lo:up) = c0 + uvel_init(lo:up) = c0 + vvel_init(lo:up) = c0 + uocn(lo:up) = c0 + vocn(lo:up) = c0 + dxt(lo:up) = c0 + dyt(lo:up) = c0 + HTE(lo:up) = c0 + HTN(lo:up) = c0 + HTEm1(lo:up) = c0 + HTNm1(lo:up) = c0 + stressp_1(lo:up) = c0 + stressp_2(lo:up) = c0 + stressp_3(lo:up) = c0 + stressp_4(lo:up) = c0 + stressm_1(lo:up) = c0 + stressm_2(lo:up) = c0 + stressm_3(lo:up) = c0 + stressm_4(lo:up) = c0 + stress12_1(lo:up) = c0 + stress12_2(lo:up) = c0 + stress12_3(lo:up) = c0 + stress12_4(lo:up) = c0 + tarear(lo:up) = c0 + Tbu(lo:up) = c0 + taubx(lo:up) = c0 + tauby(lo:up) = c0 + divu(lo:up) = c0 + rdg_conv(lo:up) = c0 + rdg_shear(lo:up) = c0 + shear(lo:up) = c0 + str1(lo:up) = c0 + str2(lo:up) = c0 + str3(lo:up) = c0 + str4(lo:up) = c0 + str5(lo:up) = c0 + str6(lo:up) = c0 + str7(lo:up) = c0 + str8(lo:up) = c0 + + call domp_get_domain(u + 1, uu, lo, up) + halo_parent(lo:up) = 0 + uvel(lo:up) = c0 + vvel(lo:up) = c0 + str1(lo:up) = c0 + str2(lo:up) = c0 + str3(lo:up) = c0 + str4(lo:up) = c0 + str5(lo:up) = c0 + str6(lo:up) = c0 + str7(lo:up) = c0 + str8(lo:up) = c0 + + end subroutine numainit + +!======================================================================= + +end module ice_dyn_evp_1d diff --git a/cicecore/cicedynB/dynamics/ice_dyn_shared.F90 b/cicecore/cicedynB/dynamics/ice_dyn_shared.F90 old mode 100644 new mode 100755 index f59c27f20..bb65f122c --- a/cicecore/cicedynB/dynamics/ice_dyn_shared.F90 +++ b/cicecore/cicedynB/dynamics/ice_dyn_shared.F90 @@ -40,13 +40,11 @@ module ice_dyn_shared ssh_stress ! 'geostrophic' or 'coupled' logical (kind=log_kind), public :: & - revised_evp ! if true, use revised evp procedure + revised_evp ! if true, use revised evp procedure - integer (kind=int_kind), public :: & - kevp_kernel ! 0 = 2D org version - ! 1 = 1D representation raw (not implemented) - ! 
2 = 1D + calculate distances inline (implemented)
-                         ! 3 = 1D + calculate distances inline + real*4 internal (not implemented yet)
+   character (len=char_len), public :: &
+      evp_algorithm   ! standard_2d   = original 2D EVP solver (default)
+                      ! shared_mem_1d = 1D shared-memory EVP solver (no MPI within the solver)
 
    ! other EVP parameters
    character (len=char_len), public :: &
@@ -55,12 +53,12 @@ module ice_dyn_shared
      ! LKD: Lemieux et al. 2015, probabilistic: Dupont et al. in prep.
 
    real (kind=dbl_kind), parameter, public :: &
-      eyc = 0.36_dbl_kind, &
-                   ! coefficient for calculating the parameter E
-      cosw = c1   , & ! cos(ocean turning angle)  ! turning angle = 0
-      sinw = c0   , & ! sin(ocean turning angle)  ! turning angle = 0
-      a_min = p001, & ! minimum ice area
-      m_min = p01     ! minimum ice mass (kg/m^2)
+      eyc = 0.36_dbl_kind, &  ! coefficient for calculating the parameter E
+      u0  = 5e-5_dbl_kind, &  ! residual velocity for seabed stress (m/s)
+      cosw = c1  , &          ! cos(ocean turning angle)  ! turning angle = 0
+      sinw = c0  , &          ! sin(ocean turning angle)  ! turning angle = 0
+      a_min = p001 , &        ! minimum ice area
+      m_min = p01             ! minimum ice mass (kg/m^2)
 
    real (kind=dbl_kind), public :: &
       revp     , & ! 0 for classic EVP, 1 for revised EVP
@@ -91,12 +89,11 @@ module ice_dyn_shared
       seabed_stress   ! if true, seabed stress for landfast on
 
    real (kind=dbl_kind), public :: &
-      k1, & ! 1st free parameter for seabed1 grounding parameterization
-      k2, & ! second free parameter (N/m^3) for seabed1 grounding parametrization
-      alphab, & ! alphab=Cb factor in Lemieux et al 2015
-      threshold_hw, & ! max water depth for grounding
+      k1 , & ! 1st free parameter for seabed1 grounding parameterization
+      k2 , & ! second free parameter (N/m^3) for seabed1 grounding parametrization
+      alphab , & ! alphab=Cb factor in Lemieux et al 2015
+      threshold_hw ! max water depth for grounding
                 ! see keel data from Amundrud et al. 2004 (JGR)
-      u0 = 5e-5_dbl_kind ! residual velocity for seabed stress (m/s)
 
 !=======================================================================
 
diff --git a/cicecore/cicedynB/general/ice_init.F90 b/cicecore/cicedynB/general/ice_init.F90
index b896c3bb9..c0c089ac8 100644
--- a/cicecore/cicedynB/general/ice_init.F90
+++ b/cicecore/cicedynB/general/ice_init.F90
@@ -97,9 +97,10 @@ subroutine input_data
         bathymetry_file, use_bathymetry, &
         bathymetry_format, &
         grid_type, grid_format, &
-        dxrect, dyrect
+        dxrect, dyrect, &
+        pgl_global_ext
     use ice_dyn_shared, only: ndte, kdyn, revised_evp, yield_curve, &
-                              kevp_kernel, &
+                              evp_algorithm, &
                               seabed_stress, seabed_stress_method, &
                               k1, k2, alphab, threshold_hw, &
                               Ktens, e_ratio, coriolis, ssh_stress, &
@@ -201,7 +202,7 @@ subroutine input_data
 
       namelist /dynamics_nml/ &
         kdyn, ndte, revised_evp, yield_curve, &
-        kevp_kernel, &
+        evp_algorithm, &
        brlx, arlx, ssh_stress, &
        advection, coriolis, kridge, ktransport, &
        kstrength, krdg_partic, krdg_redist, mu_rdg, &
@@ -329,7 +330,8 @@ subroutine input_data
       kdyn = 1          ! type of dynamics (-1, 0 = off, 1 = evp, 2 = eap, 3 = vp)
       ndtd = 1          ! dynamic time steps per thermodynamic time step
       ndte = 120        ! subcycles per dynamics timestep: ndte=dt_dyn/dte
-      kevp_kernel = 0   ! EVP kernel (0 = 2D, >0: 1D. Only ver. 2 is implemented yet)
+      evp_algorithm = 'standard_2d' ! EVP solver (standard_2d = standard CICE EVP; shared_mem_1d = 1D shared-memory solver without MPI; with multiple MPI ranks it runs on the master task only)
+      pgl_global_ext = .false.      ! if true, init primary grid lengths (global ext.)
       brlx   = 300.0_dbl_kind ! revised_evp values. 
Otherwise overwritten in ice_dyn_shared arlx = 300.0_dbl_kind ! revised_evp values. Otherwise overwritten in ice_dyn_shared revised_evp = .false. ! if true, use revised procedure for evp dynamics @@ -669,7 +671,8 @@ subroutine input_data call broadcast_scalar(kdyn, master_task) call broadcast_scalar(ndtd, master_task) call broadcast_scalar(ndte, master_task) - call broadcast_scalar(kevp_kernel, master_task) + call broadcast_scalar(evp_algorithm, master_task) + call broadcast_scalar(pgl_global_ext, master_task) call broadcast_scalar(brlx, master_task) call broadcast_scalar(arlx, master_task) call broadcast_scalar(revised_evp, master_task) @@ -1293,16 +1296,16 @@ subroutine input_data tmpstr2 = ' : revised EVP formulation not used' endif write(nu_diag,1010) ' revised_evp = ', revised_evp,trim(tmpstr2) - - if (kevp_kernel == 0) then - tmpstr2 = ' : original EVP solver' - elseif (kevp_kernel == 2 .or. kevp_kernel == 102) then - tmpstr2 = ' : vectorized EVP solver' + + if (evp_algorithm == 'standard_2d') then + tmpstr2 = ' : standard 2d EVP solver' + elseif (evp_algorithm == 'shared_mem_1d') then + tmpstr2 = ' : vectorized 1d EVP solver' + pgl_global_ext = .true. else tmpstr2 = ' : unknown value' endif - write(nu_diag,1020) ' kevp_kernel = ', kevp_kernel,trim(tmpstr2) - + write(nu_diag,1031) ' evp_algorithm = ', trim(evp_algorithm),trim(tmpstr2) write(nu_diag,1020) ' ndtd = ', ndtd, ' : number of dynamics/advection/ridging/steps per thermo timestep' write(nu_diag,1020) ' ndte = ', ndte, ' : number of EVP or EAP subcycles' endif @@ -1815,19 +1818,11 @@ subroutine input_data abort_list = trim(abort_list)//":20" endif - ! check for valid kevp_kernel - ! tcraig, kevp_kernel=2 is not validated, do not allow use - ! use "102" to test "2" for now - if (kevp_kernel /= 0) then - if (kevp_kernel == 102) then - kevp_kernel = 2 - else - if (my_task == master_task) write(nu_diag,*) subname//' ERROR: kevp_kernel = ',kevp_kernel - if (kevp_kernel == 2) then - if (my_task == master_task) write(nu_diag,*) subname//' kevp_kernel=2 not validated, use kevp_kernel=102 for testing until it is validated' - endif - abort_list = trim(abort_list)//":21" - endif + if (kdyn == 1 .and. & + evp_algorithm /= 'standard_2d' .and. & + evp_algorithm /= 'shared_mem_1d') then + if (my_task == master_task) write(nu_diag,*) subname//' ERROR: unknown evp_algorithm=',trim(evp_algorithm) + abort_list = trim(abort_list)//":21" endif if (abort_list /= "") then diff --git a/cicecore/cicedynB/infrastructure/comm/mpi/ice_boundary.F90 b/cicecore/cicedynB/infrastructure/comm/mpi/ice_boundary.F90 index 635bbbeb4..3959f12cf 100644 --- a/cicecore/cicedynB/infrastructure/comm/mpi/ice_boundary.F90 +++ b/cicecore/cicedynB/infrastructure/comm/mpi/ice_boundary.F90 @@ -74,7 +74,8 @@ module ice_boundary ice_HaloUpdate, & ice_HaloUpdate_stress, & ice_HaloExtrapolate, & - ice_HaloDestroy + ice_HaloDestroy, & + primary_grid_lengths_global_ext interface ice_HaloUpdate ! generic interface module procedure ice_HaloUpdate2DR8, & @@ -6807,6 +6808,136 @@ subroutine ice_HaloDestroy(halo) endif end subroutine ice_HaloDestroy +!*********************************************************************** + + subroutine primary_grid_lengths_global_ext( & + ARRAY_O, ARRAY_I, ew_boundary_type, ns_boundary_type) + +! This subroutine adds ghost cells to global primary grid lengths array +! ARRAY_I and outputs result to array ARRAY_O + +! Note duplicate implementation of this subroutine in: +! 
cicecore/cicedynB/infrastructure/comm/serial/ice_boundary.F90 + + use ice_constants, only: c0 + use ice_domain_size, only: nx_global, ny_global + + real (kind=dbl_kind), dimension(:,:), intent(in) :: & + ARRAY_I + + character (*), intent(in) :: & + ew_boundary_type, ns_boundary_type + + real (kind=dbl_kind), dimension(:,:), intent(out) :: & + ARRAY_O + +!----------------------------------------------------------------------- +! +! local variables +! +!----------------------------------------------------------------------- + + integer (kind=int_kind) :: & + ii, io, ji, jo + + character(len=*), parameter :: & + subname = '(primary_grid_lengths_global_ext)' + +!----------------------------------------------------------------------- +! +! add ghost cells to global primary grid lengths array +! +!----------------------------------------------------------------------- + + if (trim(ns_boundary_type) == 'tripole' .or. & + trim(ns_boundary_type) == 'tripoleT') then + call abort_ice(subname//' ERROR: '//ns_boundary_type & + //' boundary type not implemented for configuration') + endif + + do jo = 1,ny_global+2*nghost + ji = -nghost + jo + + !*** Southern ghost cells + + if (ji < 1) then + select case (trim(ns_boundary_type)) + case ('cyclic') + ji = ji + ny_global + case ('open') + ji = nghost - jo + 1 + case ('closed') + ji = 0 + case default + call abort_ice( & + subname//' ERROR: unknown north-south boundary type') + end select + endif + + !*** Northern ghost cells + + if (ji > ny_global) then + select case (trim(ns_boundary_type)) + case ('cyclic') + ji = ji - ny_global + case ('open') + ji = 2 * ny_global - ji + 1 + case ('closed') + ji = 0 + case default + call abort_ice( & + subname//' ERROR: unknown north-south boundary type') + end select + endif + + do io = 1,nx_global+2*nghost + ii = -nghost + io + + !*** Western ghost cells + + if (ii < 1) then + select case (trim(ew_boundary_type)) + case ('cyclic') + ii = ii + nx_global + case ('open') + ii = nghost - io + 1 + case ('closed') + ii = 0 + case default + call abort_ice( & + subname//' ERROR: unknown east-west boundary type') + end select + endif + + !*** Eastern ghost cells + + if (ii > nx_global) then + select case (trim(ew_boundary_type)) + case ('cyclic') + ii = ii - nx_global + case ('open') + ii = 2 * nx_global - ii + 1 + case ('closed') + ii = 0 + case default + call abort_ice( & + subname//' ERROR: unknown east-west boundary type') + end select + endif + + if (ii == 0 .or. ji == 0) then + ARRAY_O(io, jo) = c0 + else + ARRAY_O(io, jo) = ARRAY_I(ii, ji) + endif + + enddo + enddo + +!----------------------------------------------------------------------- + + end subroutine primary_grid_lengths_global_ext + !*********************************************************************** end module ice_boundary diff --git a/cicecore/cicedynB/infrastructure/comm/mpi/ice_gather_scatter.F90 b/cicecore/cicedynB/infrastructure/comm/mpi/ice_gather_scatter.F90 index 5f0d0af89..0a58769db 100644 --- a/cicecore/cicedynB/infrastructure/comm/mpi/ice_gather_scatter.F90 +++ b/cicecore/cicedynB/infrastructure/comm/mpi/ice_gather_scatter.F90 @@ -1254,7 +1254,7 @@ subroutine gather_global_ext_log(ARRAY_G, ARRAY, dst_task, src_dist, spc_val) if (present(spc_val)) then special_value = spc_val else - special_value = .false. !MHRI NOTE: .true./.false. ??? + special_value = .false. 
endif ARRAY_G = special_value diff --git a/cicecore/cicedynB/infrastructure/comm/serial/ice_boundary.F90 b/cicecore/cicedynB/infrastructure/comm/serial/ice_boundary.F90 index c66cdd13c..f3fffba59 100644 --- a/cicecore/cicedynB/infrastructure/comm/serial/ice_boundary.F90 +++ b/cicecore/cicedynB/infrastructure/comm/serial/ice_boundary.F90 @@ -61,7 +61,8 @@ module ice_boundary ice_HaloUpdate, & ice_HaloUpdate_stress, & ice_HaloExtrapolate, & - ice_HaloDestroy + ice_HaloDestroy, & + primary_grid_lengths_global_ext interface ice_HaloUpdate ! generic interface module procedure ice_HaloUpdate2DR8, & @@ -4587,6 +4588,136 @@ subroutine ice_HaloDestroy(halo) end subroutine ice_HaloDestroy +!*********************************************************************** + + subroutine primary_grid_lengths_global_ext( & + ARRAY_O, ARRAY_I, ew_boundary_type, ns_boundary_type) + +! This subroutine adds ghost cells to global primary grid lengths array +! ARRAY_I and outputs result to array ARRAY_O + +! Note duplicate implementation of this subroutine in: +! cicecore/cicedynB/infrastructure/comm/mpi/ice_boundary.F90 + + use ice_constants, only: c0 + use ice_domain_size, only: nx_global, ny_global + + real (kind=dbl_kind), dimension(:,:), intent(in) :: & + ARRAY_I + + character (*), intent(in) :: & + ew_boundary_type, ns_boundary_type + + real (kind=dbl_kind), dimension(:,:), intent(out) :: & + ARRAY_O + +!----------------------------------------------------------------------- +! +! local variables +! +!----------------------------------------------------------------------- + + integer (kind=int_kind) :: & + ii, io, ji, jo + + character(len=*), parameter :: & + subname = '(primary_grid_lengths_global_ext)' + +!----------------------------------------------------------------------- +! +! add ghost cells to global primary grid lengths array +! +!----------------------------------------------------------------------- + + if (trim(ns_boundary_type) == 'tripole' .or. & + trim(ns_boundary_type) == 'tripoleT') then + call abort_ice(subname//' ERROR: '//ns_boundary_type & + //' boundary type not implemented for configuration') + endif + + do jo = 1,ny_global+2*nghost + ji = -nghost + jo + + !*** Southern ghost cells + + if (ji < 1) then + select case (trim(ns_boundary_type)) + case ('cyclic') + ji = ji + ny_global + case ('open') + ji = nghost - jo + 1 + case ('closed') + ji = 0 + case default + call abort_ice( & + subname//' ERROR: unknown north-south boundary type') + end select + endif + + !*** Northern ghost cells + + if (ji > ny_global) then + select case (trim(ns_boundary_type)) + case ('cyclic') + ji = ji - ny_global + case ('open') + ji = 2 * ny_global - ji + 1 + case ('closed') + ji = 0 + case default + call abort_ice( & + subname//' ERROR: unknown north-south boundary type') + end select + endif + + do io = 1,nx_global+2*nghost + ii = -nghost + io + + !*** Western ghost cells + + if (ii < 1) then + select case (trim(ew_boundary_type)) + case ('cyclic') + ii = ii + nx_global + case ('open') + ii = nghost - io + 1 + case ('closed') + ii = 0 + case default + call abort_ice( & + subname//' ERROR: unknown east-west boundary type') + end select + endif + + !*** Eastern ghost cells + + if (ii > nx_global) then + select case (trim(ew_boundary_type)) + case ('cyclic') + ii = ii - nx_global + case ('open') + ii = 2 * nx_global - ii + 1 + case ('closed') + ii = 0 + case default + call abort_ice( & + subname//' ERROR: unknown east-west boundary type') + end select + endif + + if (ii == 0 .or. 
ji == 0) then + ARRAY_O(io, jo) = c0 + else + ARRAY_O(io, jo) = ARRAY_I(ii, ji) + endif + + enddo + enddo + +!----------------------------------------------------------------------- + + end subroutine primary_grid_lengths_global_ext + !*********************************************************************** end module ice_boundary diff --git a/cicecore/cicedynB/infrastructure/comm/serial/ice_gather_scatter.F90 b/cicecore/cicedynB/infrastructure/comm/serial/ice_gather_scatter.F90 index 515cdfa65..4b0bb1f9e 100644 --- a/cicecore/cicedynB/infrastructure/comm/serial/ice_gather_scatter.F90 +++ b/cicecore/cicedynB/infrastructure/comm/serial/ice_gather_scatter.F90 @@ -687,7 +687,7 @@ subroutine gather_global_ext_log(ARRAY_G, ARRAY, dst_task, src_dist, spc_val) if (present(spc_val)) then special_value = spc_val else - special_value = .false. !MHRI: true/false + special_value = .false. endif ARRAY_G = special_value diff --git a/cicecore/cicedynB/infrastructure/ice_grid.F90 b/cicecore/cicedynB/infrastructure/ice_grid.F90 index 9ce22a6f4..18dbaaefe 100644 --- a/cicecore/cicedynB/infrastructure/ice_grid.F90 +++ b/cicecore/cicedynB/infrastructure/ice_grid.F90 @@ -19,7 +19,8 @@ module ice_grid use ice_kinds_mod use ice_broadcast, only: broadcast_scalar, broadcast_array - use ice_boundary, only: ice_HaloUpdate, ice_HaloExtrapolate + use ice_boundary, only: ice_HaloUpdate, ice_HaloExtrapolate, & + primary_grid_lengths_global_ext use ice_communicate, only: my_task, master_task use ice_blocks, only: block, get_block, nx_block, ny_block, nghost use ice_domain_size, only: nx_global, ny_global, max_blocks @@ -77,6 +78,10 @@ module ice_grid ocn_gridcell_frac ! only relevant for lat-lon grids ! gridcell value of [1 - (land fraction)] (T-cell) + real (kind=dbl_kind), dimension (:,:), allocatable, public :: & + G_HTE , & ! length of eastern edge of T-cell (global ext.) + G_HTN ! length of northern edge of T-cell (global ext.) + real (kind=dbl_kind), dimension (:,:,:), allocatable, public :: & cyp , & ! 1.5*HTE(i,j)-0.5*HTW(i,j) = 1.5*HTE(i,j)-0.5*HTE(i-1,j) cxp , & ! 1.5*HTN(i,j)-0.5*HTS(i,j) = 1.5*HTN(i,j)-0.5*HTN(i,j-1) @@ -125,7 +130,8 @@ module ice_grid kmt ! ocean topography mask for bathymetry (T-cell) logical (kind=log_kind), public :: & - use_bathymetry ! flag for reading in bathymetry_file + use_bathymetry, & ! flag for reading in bathymetry_file + pgl_global_ext ! flag for init primary grid lengths (global ext.) logical (kind=log_kind), & dimension (:,:,:), allocatable, public :: & @@ -153,6 +159,8 @@ subroutine alloc_grid integer (int_kind) :: ierr + character(len=*), parameter :: subname = '(alloc_grid)' + allocate( & dxt (nx_block,ny_block,max_blocks), & ! width of T-cell through the middle (m) dyt (nx_block,ny_block,max_blocks), & ! height of T-cell through the middle (m) @@ -203,7 +211,15 @@ subroutine alloc_grid mse (2,2,nx_block,ny_block,max_blocks), & msw (2,2,nx_block,ny_block,max_blocks), & stat=ierr) - if (ierr/=0) call abort_ice('(alloc_grid): Out of memory') + if (ierr/=0) call abort_ice(subname//'ERROR: Out of memory') + + if (pgl_global_ext) then + allocate( & + G_HTE(nx_global+2*nghost, ny_global+2*nghost), & ! length of eastern edge of T-cell (global ext.) + G_HTN(nx_global+2*nghost, ny_global+2*nghost), & ! length of northern edge of T-cell (global ext.) 
+ stat=ierr) + if (ierr/=0) call abort_ice(subname//'ERROR: Out of memory') + endif end subroutine alloc_grid @@ -1499,6 +1515,10 @@ subroutine primary_grid_lengths_HTN(work_g) enddo enddo endif + if (pgl_global_ext) then + call primary_grid_lengths_global_ext( & + G_HTN, work_g, ew_boundary_type, ns_boundary_type) + endif call scatter_global(HTN, work_g, master_task, distrb_info, & field_loc_Nface, field_type_scalar) call scatter_global(dxu, work_g2, master_task, distrb_info, & @@ -1573,6 +1593,10 @@ subroutine primary_grid_lengths_HTE(work_g) enddo endif endif + if (pgl_global_ext) then + call primary_grid_lengths_global_ext( & + G_HTE, work_g, ew_boundary_type, ns_boundary_type) + endif call scatter_global(HTE, work_g, master_task, distrb_info, & field_loc_Eface, field_type_scalar) call scatter_global(dyu, work_g2, master_task, distrb_info, & diff --git a/configuration/scripts/ice_in b/configuration/scripts/ice_in index e918a694c..f9631a91d 100644 --- a/configuration/scripts/ice_in +++ b/configuration/scripts/ice_in @@ -127,7 +127,7 @@ kdyn = 1 ndte = 240 revised_evp = .false. - kevp_kernel = 0 + evp_algorithm = 'standard_2d' brlx = 300.0 arlx = 300.0 advection = 'remap' diff --git a/configuration/scripts/options/set_nml.alt04 b/configuration/scripts/options/set_nml.alt04 index 53372f124..98eb311cb 100644 --- a/configuration/scripts/options/set_nml.alt04 +++ b/configuration/scripts/options/set_nml.alt04 @@ -17,7 +17,7 @@ sw_frac = 0.9d0 sw_dtemp = 0.02d0 conduct = 'MU71' kdyn = 1 -kevp_kernel = 102 +evp_algorithm = 'shared_mem_1d' fbot_xfer_type = 'Cdn_ocn' shortwave = 'dEdd' formdrag = .true. diff --git a/configuration/scripts/options/set_nml.evp1d b/configuration/scripts/options/set_nml.evp1d new file mode 100644 index 000000000..e7d38e86b --- /dev/null +++ b/configuration/scripts/options/set_nml.evp1d @@ -0,0 +1 @@ +evp_algorithm = 'shared_mem_1d' diff --git a/configuration/scripts/options/set_nml.kevp102 b/configuration/scripts/options/set_nml.kevp102 deleted file mode 100644 index 3a5dc3dbd..000000000 --- a/configuration/scripts/options/set_nml.kevp102 +++ /dev/null @@ -1 +0,0 @@ -kevp_kernel = 102 diff --git a/configuration/scripts/tests/base_suite.ts b/configuration/scripts/tests/base_suite.ts index 69252f9fb..78df9ac81 100644 --- a/configuration/scripts/tests/base_suite.ts +++ b/configuration/scripts/tests/base_suite.ts @@ -9,9 +9,9 @@ smoke gx3 7x2 diag1,bigdiag,run1day,diagpt1 decomp gx3 4x2x25x29x5 none smoke gx3 4x2 diag1,run5day smoke_gx3_8x2_diag1_run5day smoke gx3 4x1 diag1,run5day,thread smoke_gx3_8x2_diag1_run5day -restart gx1 40x4 droundrobin -restart tx1 40x4 dsectrobin -restart tx1 60x2 droundrobin,maskhalo +smoke gx3 1x8 diag1,run5day,evp1d smoke_gx3_8x2_diag1_run5day +restart gx1 40x4 droundrobin,medium +restart tx1 40x4 dsectrobin,medium restart gx3 4x4 none restart gx3 10x4 maskhalo restart gx3 6x2 alt01 diff --git a/doc/source/developer_guide/dg_dynamics.rst b/doc/source/developer_guide/dg_dynamics.rst index 47b54bde2..48dead1cb 100644 --- a/doc/source/developer_guide/dg_dynamics.rst +++ b/doc/source/developer_guide/dg_dynamics.rst @@ -6,7 +6,7 @@ Dynamics ============================ -The CICE **cicecore/** directory consists of the non icepack source code. Within that +The CICE **cicecore/** directory consists of the non icepack source code. Within that directory there are the following subdirectories **cicecore/cicedynB/analysis** contains higher level history and diagnostic routines. 
@@ -30,28 +30,19 @@ Dynamical Solvers
 --------------------
 
 The dynamics solvers are found in **cicecore/cicedynB/dynamics/**. A couple of different solvers are
-available including EVP, revised EVP, EAP and VP. The dynamics solver is specified in namelist with the
-``kdyn`` variable. ``kdyn=1`` is evp, ``kdyn=2`` is eap, ``kdyn=3`` is VP and revised EVP requires
-the ``revised_evp`` namelist flag be set to true.
-
-Multiple EVP solvers are supported thru the namelist flag ``kevp_kernel``. The standard implementation
-and current default is ``kevp_kernel=0``. In this case, the stress is solved on the regular decomposition
-via subcycling and calls to subroutine ``stress`` and subroutine ``stepu`` with MPI global sums required in each
-subcycling call. With ``kevp_kernel=2``, the data required to compute the stress is gathered to the root
-MPI process and the stress calculation is performed on the root task without any MPI global sums. OpenMP
-parallelism is supported in ``kevp_kernel=2``. The solutions with ``kevp_kernel`` set to 0 or 2 will
-not be bit-for-bit
-identical but should be the same to roundoff and produce the same climate. ``kevp_kernel=2`` may perform
-better for some configurations, some machines, and some pe counts. ``kevp_kernel=2`` is not supported
-with the tripole grid and is still being validated. Until ``kevp_kernel=2`` is fully validated, it will
-abort if set. To override the abort, use value 102 for testing.
+available including EVP, EAP and VP. The dynamics solver is specified in namelist with the
+``kdyn`` variable. ``kdyn=1`` is evp, ``kdyn=2`` is eap, ``kdyn=3`` is VP.
+
+Two alternative implementations of EVP are included. The first is the revised EVP, triggered
+when ``revised_evp`` is set to true. The second is the 1d EVP solver, triggered when
+``evp_algorithm`` is set to ``shared_mem_1d`` instead of the default ``standard_2d``. The
+solutions with ``evp_algorithm`` set to ``standard_2d`` or ``shared_mem_1d`` will not be
+bit-for-bit identical because of floating point round-off differences, which arise unless
+strict compiler flags are used. ``evp_algorithm=shared_mem_1d`` is built primarily for OpenMP;
+if MPI domain splitting is used, the solver runs only on the master task.
+``evp_algorithm=shared_mem_1d`` is not supported
+with the tripole grid. A minimal namelist excerpt selecting the 1d solver is sketched below.
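+
+The following ``dynamics_nml`` excerpt is a minimal sketch of how the 1d solver
+might be selected; the values are illustrative, not recommendations, and all
+other dynamics options keep their defaults::
+
+   &dynamics_nml
+      kdyn          = 1
+      ndte          = 120
+      evp_algorithm = 'shared_mem_1d'
+   /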
 
 Transport
 -----------------
 
-The transport (advection) methods are found in **cicecore/cicedynB/dynamics/**. Two methods are supported,
-upwind and remap. These are set in namelist via the ``advection`` variable.
+The transport (advection) methods are found in **cicecore/cicedynB/dynamics/**. Two methods are supported,
+upwind and remap. These are set in namelist via the ``advection`` variable.
 Transport can be disabled with the ``ktransport`` namelist variable.
 
@@ -90,7 +81,7 @@ Time Manager
 
 Time manager data is module data in **cicecore/shared/ice_calendar.F90**. Much of the time manager
 data is public and operated on during the model timestepping. The model timestepping actually takes
-place in the **CICE_RunMod.F90** file which is part of the driver code.
+place in the **CICE_RunMod.F90** file which is part of the driver code.
 
 The time manager was updated in early 2021. Additional information about the time manager can be
 found here, :ref:`timemanagerplus`
 
@@ -100,12 +91,12 @@ Communication
 ------------------
 
 Two low-level communications packages, mpi and serial, are provided as part of CICE. This software
-provides a middle layer between the model and the underlying libraries. Only the CICE mpi or
+provides a middle layer between the model and the underlying libraries. Only the CICE mpi or
 serial directories are compiled with CICE, not both.
 
-**cicedynB/infrastructure/comm/mpi/**
+**cicedynB/infrastructure/comm/mpi/**
 is based on MPI and provides various methods to do halo updates, global sums, gather/scatter, broadcasts
-and similar using some fairly generic interfaces to isolate the MPI calls in the code.
+and similar using some fairly generic interfaces to isolate the MPI calls in the code.
 **cicedynB/infrastructure/comm/serial/** support the same interfaces, but operates in shared memory
 mode with no MPI. The serial library will be used, by default in the CICE scripts,
@@ -124,7 +115,7 @@ case. This has to be set before CICE is built.
 **cicedynB/infrastructure/io/io_netcdf/** is the default for the standalone CICE model, and it supports
 writing history and restart files in netcdf format using standard netcdf calls. It does this by
 writing from and reading to the root task and
-gathering and scattering fields from the root task to support model parallelism.
+gathering and scattering fields from the root task to support model parallelism.
 
 **cicedynB/infrastructure/io/io_binary/** supports files in binary format using a gather/scatter
 approach and reading to and writing from the root task.
@@ -134,4 +125,3 @@ is a parallel io library (https://github.com/NCAR/ParallelIO) that supports read
 binary and netcdf file through various interfaces including netcdf and pnetcdf. pio is generally
 more parallel in memory even when using serial netcdf than the standard gather/scatter methods, and it
 provides parallel read/write capabilities by optionally linking and using pnetcdf.
-
diff --git a/doc/source/developer_guide/dg_forcing.rst b/doc/source/developer_guide/dg_forcing.rst
index 0c0380538..aea6d8ef6 100644
--- a/doc/source/developer_guide/dg_forcing.rst
+++ b/doc/source/developer_guide/dg_forcing.rst
@@ -180,7 +180,7 @@ constant thereafter. Different conditions can be specified thru the
 
 .. _box2001forcing:
 
 Box2001 Atmosphere Forcing
--------------------------
+---------------------------
 
 The box2001 forcing dataset in generated internally. No files are read. The dataset
 is used to test an idealized box case as defined in :cite:`Hunke01`.
diff --git a/doc/source/science_guide/sg_dynamics.rst b/doc/source/science_guide/sg_dynamics.rst
index ecef531b4..d4e209d8a 100644
--- a/doc/source/science_guide/sg_dynamics.rst
+++ b/doc/source/science_guide/sg_dynamics.rst
@@ -6,10 +6,9 @@ Dynamics
 ========
 
 There are different approaches in the CICE code for representing sea ice
-rheology and for solving the sea ice momentum equation. The
-elastic-viscous-plastic (EVP) model represents a modification of the
-standard viscous-plastic (VP) model for sea ice dynamics
-:cite:`Hibler79`. The elastic-anisotropic-plastic (EAP) model,
+rheology and for solving the sea ice momentum equation. The viscous-plastic (VP) model was
+originally developed by :cite:`Hibler79`; the elastic-viscous-plastic (EVP) model
+:cite:`Hunke97` is a modification of the standard VP model. The elastic-anisotropic-plastic (EAP) model,
 on the other hand, explicitly accounts for the observed sub-continuum
 anisotropy of the sea ice cover :cite:`Wilchinsky06,Weiss09`.
If ``kdyn`` = 1 in the namelist then the EVP model is used (module @@ -68,7 +67,7 @@ vertical direction: where :math:`m` is the combined mass of ice and snow per unit area and :math:`\vec{\tau}_a` and :math:`\vec{\tau}_w` are wind and ocean -stresses, respectively. The term :math:`\vec{\tau}_b` is a +stresses, respectively. The term :math:`\vec{\tau}_b` is a seabed stress (also referred to as basal stress) that represents the grounding of pressure ridges in shallow water :cite:`Lemieux16`. The mechanical properties of the ice are represented by the internal stress tensor :math:`\sigma_{ij}`. The other two terms on @@ -84,11 +83,11 @@ For clarity, the two components of Equation :eq:`vpmom` are .. math:: \begin{aligned} - m{\partial u\over\partial t} &= {\partial\sigma_{1j}\over\partial x_j} + \tau_{ax} + + m{\partial u\over\partial t} &= {\partial\sigma_{1j}\over\partial x_j} + \tau_{ax} + a_i c_w \rho_w \left|{\bf U}_w - {\bf u}\right| \left[\left(U_w-u\right)\cos\theta - \left(V_w-v\right)\sin\theta\right] -C_bu +mfv - mg{\partial H_\circ\over\partial x}, \\ - m{\partial v\over\partial t} &= {\partial\sigma_{2j}\over\partial x_j} + \tau_{ay} + + m{\partial v\over\partial t} &= {\partial\sigma_{2j}\over\partial x_j} + \tau_{ay} + a_i c_w \rho_w \left|{\bf U}_w - {\bf u}\right| \left[\left(U_w-u\right)\sin\theta + \left(V_w-v\right)\cos\theta\right] -C_bv-mfu - mg{\partial H_\circ\over\partial y}. \end{aligned} @@ -111,14 +110,14 @@ Elastic-Viscous-Plastic The momentum equation is discretized in time as follows, for the classic EVP approach. In the code, -:math:`{\tt vrel}=a_i c_w \rho_w\left|{\bf U}_w - {\bf u}^k\right|` and -:math:`C_b=T_b \left( \sqrt{(u^k)^2+(v^k)^2}+u_0 \right)^{-1}`, +:math:`{\tt vrel}=a_i c_w \rho_w\left|{\bf U}_w - {\bf u}^k\right|` and +:math:`C_b=T_b \left( \sqrt{(u^k)^2+(v^k)^2}+u_0 \right)^{-1}`, where :math:`k` denotes the subcycling step. The following equations illustrate the time discretization and define some of the other variables used in the code. .. math:: - \underbrace{\left({m\over\Delta t_e}+{\tt vrel} \cos\theta\ + C_b \right)}_{\tt cca} u^{k+1} + \underbrace{\left({m\over\Delta t_e}+{\tt vrel} \cos\theta\ + C_b \right)}_{\tt cca} u^{k+1} - \underbrace{\left(mf+{\tt vrel}\sin\theta\right)}_{\tt ccb}v^{k+1} = &\underbrace{{\partial\sigma_{1j}^{k+1}\over\partial x_j}}_{\tt strintx} + \underbrace{\tau_{ax} - mg{\partial H_\circ\over\partial x} }_{\tt forcex} \\ @@ -126,7 +125,7 @@ variables used in the code. :label: umom .. math:: - \underbrace{\left(mf+{\tt vrel}\sin\theta\right)}_{\tt ccb} u^{k+1} + \underbrace{\left(mf+{\tt vrel}\sin\theta\right)}_{\tt ccb} u^{k+1} + \underbrace{\left({m\over\Delta t_e}+{\tt vrel} \cos\theta + C_b \right)}_{\tt cca}v^{k+1} = &\underbrace{{\partial\sigma_{2j}^{k+1}\over\partial x_j}}_{\tt strinty} + \underbrace{\tau_{ay} - mg{\partial H_\circ\over\partial y} }_{\tt forcey} \\ @@ -139,7 +138,7 @@ We solve this system of equations analytically for :math:`u^{k+1}` and :math:`v^{k+1}`. Define .. math:: - \hat{u} = F_u + \tau_{ax} - mg{\partial H_\circ\over\partial x} + {\tt vrel} \left(U_w\cos\theta - V_w\sin\theta\right) + {m\over\Delta t_e}u^k + \hat{u} = F_u + \tau_{ax} - mg{\partial H_\circ\over\partial x} + {\tt vrel} \left(U_w\cos\theta - V_w\sin\theta\right) + {m\over\Delta t_e}u^k :label: cevpuhat .. math:: @@ -169,7 +168,7 @@ where .. math:: b = mf + {\tt vrel}\sin\theta. :label: cevpb - + .. _vp-momentum: Viscous-Plastic @@ -248,52 +247,52 @@ stress are expressed as :math:`\tau_{bx}=C_bu` and coefficient. 
The two parameterizations differ in their calculation of
-the :math:`C_b` coefficients.
+the :math:`C_b` coefficients.

Note that the user must provide a bathymetry field for using these
grounding schemes. It is suggested to use a bathymetry field with water depths larger than 5 m
that represents shallow water regions (less than 30 m) well, such as the Laptev Sea
-and the East Siberian Sea.
+and the East Siberian Sea.

Seabed stress based on linear keel draft (LKD)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This parameterization for the seabed stress is described in
:cite:`Lemieux16`. It assumes that the largest keel draft varies linearly with the mean thickness in a grid cell (i.e. sea ice volume). The :math:`C_b` coefficients are expressed as

.. math::
   C_b= k_2 \max [0,(h_u - h_{cu})] e^{-\alpha_b * (1 - a_u)} (\sqrt{u^2+v^2}+u_0)^{-1}, \\
-   :label: Cb
+   :label: Cb

-where :math:`k_2` determines the maximum seabed stress that can be sustained by the grounded parameterized ridge(s), :math:`u_0`
-is a small residual velocity and :math:`\alpha_b` is a parameter to ensure that the seabed stress quickly drops when
-the ice concentration is smaller than 1. In the code, :math:`k_2 \max [0,(h_u - h_{cu})] e^{-\alpha_b * (1 - a_u)}` is defined as
-:math:`T_b`. The quantities :math:`h_u`, :math:`a_{u}` and :math:`h_{cu}` are calculated at
-the 'u' point based on local ice conditions (surrounding tracer points). They are respectively given by
+where :math:`k_2` determines the maximum seabed stress that can be sustained by the grounded parameterized ridge(s), :math:`u_0`
+is a small residual velocity and :math:`\alpha_b` is a parameter to ensure that the seabed stress quickly drops when
+the ice concentration is smaller than 1. In the code, :math:`k_2 \max [0,(h_u - h_{cu})] e^{-\alpha_b * (1 - a_u)}` is defined as
+:math:`T_b`. The quantities :math:`h_u`, :math:`a_{u}` and :math:`h_{cu}` are calculated at
+the 'u' point based on local ice conditions (surrounding tracer points). They are respectively given by

.. math::
   h_u=\max[v_i(i,j),v_i(i+1,j),v_i(i,j+1),v_i(i+1,j+1)], \\
-   :label: hu
-
+   :label: hu
+
.. math::
   a_u=\max[a_i(i,j),a_i(i+1,j),a_i(i,j+1),a_i(i+1,j+1)]. \\
-   :label: au
-
+   :label: au
+
.. math::
   h_{cu}=a_u h_{wu} / k_1, \\
   :label: hcu

-where the :math:`a_i` and :math:`v_i` are the total ice concentrations and ice volumes around the :math:`u` point :math:`i,j` and
-:math:`k_1` is a parameter that defines the critical ice thickness :math:`h_{cu}` at which the parameterized
-ridge(s) reaches the seafloor for a water depth :math:`h_{wu}=\min[h_w(i,j),h_w(i+1,j),h_w(i,j+1),h_w(i+1,j+1)]`. Given the formulation of :math:`C_b` in equation :eq:`Cb`, the seabed stress components are non-zero only
-when :math:`h_u > h_{cu}`.
+where :math:`a_i` and :math:`v_i` are the total ice concentrations and ice volumes around the :math:`u` point :math:`i,j` and
+:math:`k_1` is a parameter that defines the critical ice thickness :math:`h_{cu}` at which the parameterized
+ridge(s) reaches the seafloor for a water depth :math:`h_{wu}=\min[h_w(i,j),h_w(i+1,j),h_w(i,j+1),h_w(i+1,j+1)]`. Given the formulation of :math:`C_b` in equation :eq:`Cb`, the seabed stress components are non-zero only
+when :math:`h_u > h_{cu}`.

-The maximum seabed stress depends on the weight of the ridge
-above hydrostatic balance and the value of :math:`k_2`. It is, however, the parameter :math:`k_1` that has the most notable impact on the simulated extent of landfast ice.
-The value of :math:`k_1` can be changed at runtime using the namelist variable ``k1``. +The maximum seabed stress depends on the weight of the ridge +above hydrostatic balance and the value of :math:`k_2`. It is, however, the parameter :math:`k_1` that has the most notable impact on the simulated extent of landfast ice. +The value of :math:`k_1` can be changed at runtime using the namelist variable ``k1``. -To prevent unrealistic grounding, :math:`T_b` is set to zero when :math:`h_{wu}` -is larger than 30 m. This maximum value is chosen based on observations of large +To prevent unrealistic grounding, :math:`T_b` is set to zero when :math:`h_{wu}` +is larger than 30 m. This maximum value is chosen based on observations of large keels in the Arctic Ocean :cite:`Amundrud04`. Seabed stress based on probabilistic approach @@ -304,11 +303,11 @@ on the probability of contact between the ice thickness distribution (ITD) and the seabed. Multi-thickness category models such as CICE typically use a few thickness categories (5-10). This crude representation of the ITD does not resolve the tail of the ITD, which is crucial for grounding -events. +events. To represent the tail of the distribution, the simulated ITD is converted to a positively skewed probability function :math:`f(x)` -with :math:`x` the sea ice thickness. The mean and variance are set +with :math:`x` the sea ice thickness. The mean and variance are set equal to the ones of the original ITD. A log-normal distribution is used for :math:`f(x)`. @@ -317,7 +316,7 @@ distribution :math:`b(y)`. The mean of :math:`b(y)` comes from the user's bathym standard deviation :math:`\sigma_b` is currently fixed to 2.5 m. Two possible improvements would be to specify a distribution based on high resolution bathymetry data and to take into account variations of the -water depth due to changes in the sea surface height. +water depth due to changes in the sea surface height. Assuming hydrostatic balance and neglecting the impact of snow, the draft of floating ice of thickness :math:`x` is :math:`D(x)=\rho_i x / \rho_w` where :math:`\rho_i` is the sea ice density. Hence, the probability of contact (:math:`P_c`) between the @@ -337,7 +336,7 @@ and then obtains :math:`T_{bt}` by multiplying :math:`T_{bt}^*` by :math:`e^{-\a To calculate :math:`T_{bt}^*` in equation :eq:`Tbt`, :math:`f(x)` and :math:`b(y)` are discretized using many small categories (100). :math:`f(x)` is discretized between 0 and 50 m while :math:`b(y)` is truncated at plus and minus three :math:`\sigma_b`. :math:`f(x)` is also modified by setting it to zero after a certain percentile of the log-normal distribution. This percentile, which is currently set to 99.7%, notably affects the simulation of landfast ice and is used as a tuning parameter. Its impact is similar to the one of the parameter :math:`k_1` for the LKD method. -:math:`T_b` at the 'u' point is calculated from the 't' point values around it according to +:math:`T_b` at the 'u' point is calculated from the 't' point values around it according to .. math:: T_b=\max[T_{bt}(i,j),T_{bt}(i+1,j),T_{bt}(i,j+1),T_{bt}(i+1,j+1)]. \\ @@ -362,13 +361,13 @@ divergence, :math:`D_D`, and the horizontal tension and shearing strain rates, :math:`D_T` and :math:`D_S` respectively: .. math:: - D_D = \dot{\epsilon}_{11} + \dot{\epsilon}_{22}, + D_D = \dot{\epsilon}_{11} + \dot{\epsilon}_{22}, .. math:: - D_T = \dot{\epsilon}_{11} - \dot{\epsilon}_{22}, + D_T = \dot{\epsilon}_{11} - \dot{\epsilon}_{22}, .. 
math:: - D_S = 2\dot{\epsilon}_{12}, + D_S = 2\dot{\epsilon}_{12}, where @@ -376,12 +375,12 @@ where \dot{\epsilon}_{ij} = {1\over 2}\left({{\partial u_i}\over{\partial x_j}} + {{\partial u_j}\over{\partial x_i}}\right) CICE can output the internal ice pressure which is an important field to support navigation in ice-infested water. -The internal ice pressure (``sigP``) is the average of the normal stresses multiplied by :math:`-1` and +The internal ice pressure (``sigP``) is the average of the normal stresses multiplied by :math:`-1` and is therefore simply equal to :math:`-\sigma_1/2`. -Following the approach of :cite:`Konig10` (see also :cite:`Lemieux16`), the -elliptical yield curve can be modified such that the ice has isotropic tensile strength. -The tensile strength :math:`T_p` is expressed as a fraction of the ice strength :math:`P`, that is :math:`T_p=k_t P` +Following the approach of :cite:`Konig10` (see also :cite:`Lemieux16`), the +elliptical yield curve can be modified such that the ice has isotropic tensile strength. +The tensile strength :math:`T_p` is expressed as a fraction of the ice strength :math:`P`, that is :math:`T_p=k_t P` where :math:`k_t` should be set to a value between 0 and 1 (this can be changed at runtime with the namelist parameter ``Ktens``). The ice strength :math:`P` is a function of the ice thickness distribution as @@ -403,10 +402,10 @@ where :math:`\eta` and :math:`\zeta` are the bulk and shear viscosities. An elliptical yield curve is used, with the viscosities given by .. math:: - \zeta = {P(1+k_t)\over 2\Delta}, + \zeta = {P(1+k_t)\over 2\Delta}, .. math:: - \eta = {P(1+k_t)\over {2\Delta e^2}}, + \eta = {P(1+k_t)\over {2\Delta e^2}}, where @@ -447,7 +446,7 @@ dynamics component is subcycled within the time step, and the elastic parameter :math:`E` is defined in terms of a damping timescale :math:`T` for elastic waves, :math:`\Delta t_e < T < \Delta t`, as -.. math:: +.. math:: E = {\zeta\over T}, where :math:`T=E_\circ\Delta t` and :math:`E_\circ` (eyc) is a tunable @@ -455,7 +454,7 @@ parameter less than one. Including the modification proposed by :cite:`Bouillon1 .. math:: \begin{aligned} - {\partial\sigma_1\over\partial t} + {\sigma_1\over 2T} + {\partial\sigma_1\over\partial t} + {\sigma_1\over 2T} + {P_R(1-k_t)\over 2T} &=& {P(1+k_t)\over 2T\Delta} D_D, \\ {\partial\sigma_2\over\partial t} + {\sigma_2\over 2T} &=& {P(1+k_t)\over 2Te^2\Delta} D_T,\\ @@ -466,14 +465,14 @@ Once discretized in time, these last three equations are written as .. math:: \begin{aligned} - {(\sigma_1^{k+1}-\sigma_1^{k})\over\Delta t_e} + {\sigma_1^{k+1}\over 2T} + {(\sigma_1^{k+1}-\sigma_1^{k})\over\Delta t_e} + {\sigma_1^{k+1}\over 2T} + {P_R^k(1-k_t)\over 2T} &=& {P(1+k_t)\over 2T\Delta^k} D_D^k, \\ {(\sigma_2^{k+1}-\sigma_2^{k})\over\Delta t_e} + {\sigma_2^{k+1}\over 2T} &=& {P(1+k_t)\over 2Te^2\Delta^k} D_T^k,\\ {(\sigma_{12}^{k+1}-\sigma_{12}^{k})\over\Delta t_e} + {\sigma_{12}^{k+1}\over 2T} &=& {P(1+k_t)\over 4Te^2\Delta^k}D_S^k,\end{aligned} - :label: sigdisc - + :label: sigdisc + where :math:`k` denotes again the subcycling step. All coefficients on the left-hand side are constant except for :math:`P_R`. This modification compensates for the decreased efficiency of including @@ -498,7 +497,7 @@ anisotropy of the sea ice cover is accounted for by an additional prognostic variable, the structure tensor :math:`\mathbf{A}` defined by -.. math:: +.. math:: {\mathbf A}=\int_{\mathbb{S}}\vartheta(\mathbf r)\mathbf r\mathbf r d\mathbf r\label{structuretensor}. 
where :math:`\mathbb{S}` is a unit-radius circle; **A** is a unit @@ -517,7 +516,7 @@ components of :math:`\mathbf{A}`, :math:`A_{1}/A_{2}`, are derived from the phenomenological evolution equation for the structure tensor :math:`\mathbf A`, -.. math:: +.. math:: \frac{D\mathbf{A}}{D t}=\mathbf{F}_{iso}(\mathbf{A})+\mathbf{F}_{frac}(\mathbf{A},\boldsymbol\sigma), :label: evolutionA @@ -581,7 +580,7 @@ of two equations: .. math:: \begin{aligned} - \frac{\partial A_{11}}{\partial t}&=&-k_{t}\left(A_{11}-\frac{1}{2}\right)+M_{11} \mbox{,} \\ + \frac{\partial A_{11}}{\partial t}&=&-k_{t}\left(A_{11}-\frac{1}{2}\right)+M_{11} \mbox{,} \\ \frac{\partial A_{12}}{\partial t}&=&-k_{t} A_{12}+M_{12} \mbox{,}\end{aligned} where the first terms on the right hand side correspond to the @@ -618,7 +617,7 @@ but in a continuum-scale sea ice region the floes can possess different orientations in different places and we take the mean sea ice stress over a collection of floes to be given by the average -.. math:: +.. math:: \boldsymbol\sigma^{EAP}(h)=P_{r}(h)\int_{\mathbb{S}}\vartheta(\mathbf r)\left[\boldsymbol\sigma_{r}^{b}(\mathbf r)+ k \boldsymbol\sigma_{s}^{b}(\mathbf r)\right]d\mathbf r :label: stressaverage @@ -633,11 +632,11 @@ efficient, explicit numerical algorithm used to solve the full sea ice momentum balance. We use the analogous EAP stress equations, .. math:: - \frac{\partial \sigma_{1}}{\partial t}+\frac{\sigma_1}{2T} = \frac{\sigma^{EAP}_{1}}{2T} \mbox{,} + \frac{\partial \sigma_{1}}{\partial t}+\frac{\sigma_1}{2T} = \frac{\sigma^{EAP}_{1}}{2T} \mbox{,} :label: EAPsigma1 .. math:: - \frac{\partial \sigma_{2}}{\partial t}+\frac{\sigma_2}{2T} = \frac{\sigma^{EAP}_{2}}{2T} \mbox{,} + \frac{\partial \sigma_{2}}{\partial t}+\frac{\sigma_2}{2T} = \frac{\sigma^{EAP}_{2}}{2T} \mbox{,} :label: EAPsigma2 .. math:: @@ -676,44 +675,44 @@ of the dynamics. Revised approach **************** -The revised EVP approach is based on a pseudo-time iterative scheme :cite:`Lemieux12`, :cite:`Bouillon13`, :cite:`Kimmritz15`. By construction, the revised EVP approach should lead to the VP solution -(given the right numerical parameters and a sufficiently large number of iterations). To do so, the inertial term is formulated such that it matches the backward Euler approach of -implicit solvers and there is an additional term for the pseudo-time iteration. Hence, with the revised approach, the discretized momentum equations :eq:`umom` and :eq:`vmom` become +The revised EVP approach is based on a pseudo-time iterative scheme :cite:`Lemieux12`, :cite:`Bouillon13`, :cite:`Kimmritz15`. By construction, the revised EVP approach should lead to the VP solution +(given the right numerical parameters and a sufficiently large number of iterations). To do so, the inertial term is formulated such that it matches the backward Euler approach of +implicit solvers and there is an additional term for the pseudo-time iteration. Hence, with the revised approach, the discretized momentum equations :eq:`umom` and :eq:`vmom` become .. 
math:: - {\beta^*(u^{k+1}-u^k)\over\Delta t_e} + {m(u^{k+1}-u^n)\over\Delta t} + {\left({\tt vrel} \cos\theta + C_b \right)} u^{k+1} + {\beta^*(u^{k+1}-u^k)\over\Delta t_e} + {m(u^{k+1}-u^n)\over\Delta t} + {\left({\tt vrel} \cos\theta + C_b \right)} u^{k+1} - {\left(mf+{\tt vrel}\sin\theta\right)} v^{k+1} - = {{\partial\sigma_{1j}^{k+1}\over\partial x_j}} + = {{\partial\sigma_{1j}^{k+1}\over\partial x_j}} + {\tau_{ax} - mg{\partial H_\circ\over\partial x} } + {\tt vrel} {\left(U_w\cos\theta-V_w\sin\theta\right)}, :label: umomr .. math:: - {\beta^*(v^{k+1}-v^k)\over\Delta t_e} + {m(v^{k+1}-v^n)\over\Delta t} + {\left({\tt vrel} \cos\theta + C_b \right)}v^{k+1} - + {\left(mf+{\tt vrel}\sin\theta\right)} u^{k+1} - = {{\partial\sigma_{2j}^{k+1}\over\partial x_j}} + {\beta^*(v^{k+1}-v^k)\over\Delta t_e} + {m(v^{k+1}-v^n)\over\Delta t} + {\left({\tt vrel} \cos\theta + C_b \right)}v^{k+1} + + {\left(mf+{\tt vrel}\sin\theta\right)} u^{k+1} + = {{\partial\sigma_{2j}^{k+1}\over\partial x_j}} + {\tau_{ay} - mg{\partial H_\circ\over\partial y} } + {\tt vrel}{\left(U_w\sin\theta+V_w\cos\theta\right)}, :label: vmomr -where :math:`\beta^*` is a numerical parameter and :math:`u^n, v^n` are the components of the previous time level solution. +where :math:`\beta^*` is a numerical parameter and :math:`u^n, v^n` are the components of the previous time level solution. With :math:`\beta=\beta^* \Delta t \left( m \Delta t_e \right)^{-1}` :cite:`Bouillon13`, these equations can be written as - + .. math:: - \underbrace{\left((\beta+1){m\over\Delta t}+{\tt vrel} \cos\theta\ + C_b \right)}_{\tt cca} u^{k+1} + \underbrace{\left((\beta+1){m\over\Delta t}+{\tt vrel} \cos\theta\ + C_b \right)}_{\tt cca} u^{k+1} - \underbrace{\left(mf+{\tt vrel}\sin\theta\right)}_{\tt ccb}v^{k+1} - = \underbrace{{\partial\sigma_{1j}^{k+1}\over\partial x_j}}_{\tt strintx} + = \underbrace{{\partial\sigma_{1j}^{k+1}\over\partial x_j}}_{\tt strintx} + \underbrace{\tau_{ax} - mg{\partial H_\circ\over\partial x} }_{\tt forcex} + {\tt vrel}\underbrace{\left(U_w\cos\theta-V_w\sin\theta\right)}_{\tt waterx} + {m\over\Delta t}(\beta u^k + u^n), :label: umomr2 .. math:: - \underbrace{\left(mf+{\tt vrel}\sin\theta\right)}_{\tt ccb} u^{k+1} + \underbrace{\left(mf+{\tt vrel}\sin\theta\right)}_{\tt ccb} u^{k+1} + \underbrace{\left((\beta+1){m\over\Delta t}+{\tt vrel} \cos\theta + C_b \right)}_{\tt cca}v^{k+1} - = \underbrace{{\partial\sigma_{2j}^{k+1}\over\partial x_j}}_{\tt strinty} + = \underbrace{{\partial\sigma_{2j}^{k+1}\over\partial x_j}}_{\tt strinty} + \underbrace{\tau_{ay} - mg{\partial H_\circ\over\partial y} }_{\tt forcey} + {\tt vrel}\underbrace{\left(U_w\sin\theta+V_w\cos\theta\right)}_{\tt watery} + {m\over\Delta t}(\beta v^k + v^n), - :label: vmomr2 + :label: vmomr2 At this point, the solutions :math:`u^{k+1}` and :math:`v^{k+1}` are obtained in the same manner as for the standard EVP approach (see equations :eq:`cevpuhat` to :eq:`cevpb`). @@ -721,16 +720,26 @@ Introducing another numerical parameter :math:`\alpha=2T \Delta t_e ^{-1}` :cite .. 
math::
   \begin{aligned}
-   {\alpha (\sigma_1^{k+1}-\sigma_1^{k})} + {\sigma_1^{k}}
+   {\alpha (\sigma_1^{k+1}-\sigma_1^{k})} + {\sigma_1^{k}}
   + {P_R^k(1-k_t)} &=& {P(1+k_t)\over \Delta^k} D_D^k, \\
   {\alpha (\sigma_2^{k+1}-\sigma_2^{k})} + {\sigma_2^{k}} &=& {P(1+k_t)\over e^2\Delta^k} D_T^k,\\
   {\alpha (\sigma_{12}^{k+1}-\sigma_{12}^{k})} + {\sigma_{12}^{k}} &=& {P(1+k_t)\over 2e^2\Delta^k}D_S^k,\end{aligned}
-
-where as opposed to the classic EVP, the second term in each equation is at iteration :math:`k` :cite:`Bouillon13`. Also, as opposed to the classic EVP,
-:math:`\Delta t_e` times the number of subcycles (or iterations) does not need to be equal to the advective time step :math:`\Delta t`.
-Finally, as with the classic EVP approach, the stresses are initialized using the previous time level values.
-The revised EVP is activated by setting the namelist parameter ``revised_evp = true``.
-In the code :math:`\alpha` is ``arlx`` and :math:`\beta` is ``brlx``. The values of ``arlx`` and ``brlx`` can be set in the namelist.
+
+where, as opposed to the classic EVP, the second term in each equation is at iteration :math:`k` :cite:`Bouillon13`. Also, as opposed to the classic EVP,
+:math:`\Delta t_e` times the number of subcycles (or iterations) does not need to be equal to the advective time step :math:`\Delta t`.
+Finally, as with the classic EVP approach, the stresses are initialized using the previous time level values.
+The revised EVP is activated by setting the namelist parameter ``revised_evp = true``.
+In the code, :math:`\alpha` is ``arlx`` and :math:`\beta` is ``brlx``. The values of ``arlx`` and ``brlx`` can be set in the namelist.
It is recommended to use large values of these parameters and to set :math:`\alpha=\beta` :cite:`Kimmritz15`.
+
+.. _evp1d:
+
+****************
+1d EVP solver
+****************
+
+The standard EVP solver iterates hundreds of times, and each iteration includes communication through MPI but only a limited number of calculations. This limits how much the solver can be optimized, because the speed is primarily determined by the communication. The 1d EVP solver avoids this communication by utilizing shared memory, which removes the need for calls to the MPI communicator and thereby improves the potential scalability of the code. Performance is best in shared-memory configurations, but the solver is also functional in MPI and hybrid MPI/OpenMP setups, where it runs on the master task alone.
+
+The scalability of geophysical models is in general limited by memory usage. To optimize this, the 1d EVP solver solves the same equations that are outlined in the section :ref:`stress-evp`, but it transforms all 2d arrays (matrices) into 1d vectors, which maps better onto the computer hardware. The vectorization and the contiguous placement of arrays in memory make it easier for the compiler to optimize the code and to pass pointers instead of copying the vectors. The 1d solver is not supported for tripole grids, and the code will abort if this combination is attempted.
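To make the vectorization concrete, the following schematic sketch shows how active ice points can be packed from a 2d block into a contiguous 1d vector before the subcycling loop. The routine name, argument list, and mask are hypothetical and greatly simplified relative to the actual copy-in routine:

.. code-block:: fortran

   ! Schematic sketch only: pack one 2d field into a contiguous 1d vector
   ! over the active ice points. The real 1d solver copies many fields and
   ! handles block and halo bookkeeping.
   subroutine gather_active(nx, ny, icemask, f2d, f1d, nactive)
      implicit none
      integer, parameter :: dbl_kind = selected_real_kind(13)
      integer,              intent(in)  :: nx, ny
      logical,              intent(in)  :: icemask(nx,ny) ! true where ice is present
      real (kind=dbl_kind), intent(in)  :: f2d(nx,ny)
      real (kind=dbl_kind), intent(out) :: f1d(nx*ny)     ! contiguous 1d copy
      integer,              intent(out) :: nactive        ! number of active points
      integer :: i, j
      nactive = 0
      do j = 1, ny
         do i = 1, nx
            if (icemask(i,j)) then
               nactive = nactive + 1
               f1d(nactive) = f2d(i,j) ! contiguous storage aids compiler vectorization
            endif
         enddo
      enddo
   end subroutine gather_active

The solver can then iterate over ``1:nactive`` with unit-stride memory access, scattering the results back to the 2d blocks once the subcycling completes.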
diff --git a/doc/source/user_guide/ug_case_settings.rst b/doc/source/user_guide/ug_case_settings.rst index 562a5bc81..b1f6b4614 100644 --- a/doc/source/user_guide/ug_case_settings.rst +++ b/doc/source/user_guide/ug_case_settings.rst @@ -395,8 +395,8 @@ dynamics_nml "", "``1``", "EVP dynamics", "" "", "``2``", "EAP dynamics", "" "", "``3``", "VP dynamics", "" - "``kevp_kernel``", "``0``", "standard 2D EVP memory parallel solver", "0" - "", "``2``", "1D shared memory solver (not fully validated)", "" + "``evp_algorithm``", "``standard_2d``", "standard 2d EVP memory parallel solver", "standard_2d" + "", "``shared_mem_1d``", "1d shared memory solver", "" "``kstrength``", "``0``", "ice strength formulation :cite:`Hibler79`", "1" "", "``1``", "ice strength formulation :cite:`Rothrock75`", "" "``krdg_partic``", "``0``", "old ridging participation function", "1"
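For reference, a hedged sketch of how the new option might be set in the ``ice_in`` namelist file (only the settings discussed here are shown; the actual ``dynamics_nml`` group contains many more entries):

.. code-block:: fortran

   &dynamics_nml
     kdyn          = 1                 ! EVP dynamics
     evp_algorithm = 'shared_mem_1d'   ! 1d shared memory solver; default is 'standard_2d'
   /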