diff --git a/src/enkf/enkf.f90 b/src/enkf/enkf.f90
index 8ea45a5b67..adf9921e9d 100644
--- a/src/enkf/enkf.f90
+++ b/src/enkf/enkf.f90
@@ -123,7 +123,7 @@ module enkf
                   obtype, oberrvarmean, numobspersat, deltapredx, biaspreds,&
                   oberrvar_orig, probgrosserr, prpgerr,&
                   corrlengthsq,lnsigl,obtimel,obloclat,obloclon,obpress,stattype,&
-                  anal_ob
+                  anal_ob,anal_ob_post
 use constants, only: pi, one, zero
 use params, only: sprd_tol, paoverpb_thresh, datapath, nanals,&
                   iassim_order,sortinc,deterministic,numiter,nlevs,&
@@ -831,19 +831,19 @@ subroutine enkf_update()
       call mpi_send(anal_obchunk,numobsperproc(nproc+1)*nanals,mpi_real,0, &   
                     1,mpi_comm_world,ierr)   
    else   
-      allocate(anal_ob(1:nanals,nobstot))   
+      allocate(anal_ob_post(1:nanals,nobstot))   
       allocate(buffertmp3(nanals,nobs_max))   
       do np=1,numproc-1   
          call mpi_recv(buffertmp3,numobsperproc(np+1)*nanals,mpi_real,np, &   
                        1,mpi_comm_world,mpi_status,ierr)   
          do nob1=1,numobsperproc(np+1)   
             nob2 = indxproc_obs(np+1,nob1)   
-            anal_ob(:,nob2) = buffertmp3(:,nob1)   
+            anal_ob_post(:,nob2) = buffertmp3(:,nob1)   
          end do   
       end do   
       do nob1=1,numobsperproc(1)   
          nob2 = indxproc_obs(1,nob1)   
-         anal_ob(:,nob2) = anal_obchunk(:,nob1)   
+         anal_ob_post(:,nob2) = anal_obchunk(:,nob1)   
       end do   
       deallocate(buffertmp3)   
    end if   
diff --git a/src/enkf/enkf_obs_sensitivity.f90 b/src/enkf/enkf_obs_sensitivity.f90
index 481d702288..afa2e242e8 100644
--- a/src/enkf/enkf_obs_sensitivity.f90
+++ b/src/enkf/enkf_obs_sensitivity.f90
@@ -44,7 +44,7 @@ module enkf_obs_sensitivity
 use enkf_obsmod, only: nobstot,nobs_conv,nobs_oz,nobs_sat,obtype,obloclat, &
                        obloclon,obpress,indxsat,oberrvar,stattype,obtime,ob, &
                        ensmean_ob,ensmean_obnobc,obsprd_prior,obfit_prior, &
-                       oberrvar_orig,biaspreds,anal_ob,nobstot,lnsigl, &
+                       oberrvar_orig,biaspreds,anal_ob_post,nobstot,lnsigl, &
                        corrlengthsq,obtimel,oblnp,obloc
 use convinfo, only: convinfo_read,init_convinfo
 use ozinfo, only: ozinfo_read,init_oz
@@ -218,7 +218,7 @@ subroutine read_ob_sens
   allocate(biaspreds(npred+1,nobs_sat))
   allocate(tmpanal_ob(nanals))
   allocate(tmpbiaspreds(npred+1))
-  if(nproc == 0) allocate(anal_ob(nanals,nobstot))
+  if(nproc == 0) allocate(anal_ob_post(nanals,nobstot))
   ! Read loop over conventional observations
   do nob=1,nobs_conv+nobs_oz
      read(iunit) indata,tmpanal_ob
@@ -235,7 +235,7 @@ subroutine read_ob_sens
      oberrvar_orig(nob) = real(indata%oberrvar_orig,r_kind)
      stattype(nob) = indata%stattype
      obtype(nob) = indata%obtype
-     if(nproc == 0) anal_ob(1:nanals,nob) = real(tmpanal_ob(1:nanals),r_kind)
+     if(nproc == 0) anal_ob_post(1:nanals,nob) = real(tmpanal_ob(1:nanals),r_kind)
   end do
   ! Read loop over satellite radiance observations
   nn = 0
@@ -256,7 +256,7 @@ subroutine read_ob_sens
      stattype(nob) = indata%stattype
      obtype(nob) = indata%obtype
      indxsat(nn) = indata%indxsat
-     if(nproc == 0) anal_ob(1:nanals,nob) = real(tmpanal_ob(1:nanals),r_kind)
+     if(nproc == 0) anal_ob_post(1:nanals,nob) = real(tmpanal_ob(1:nanals),r_kind)
      biaspreds(1:npred+1,nn) = real(tmpbiaspreds(1:npred+1),r_kind)
   end do
   if(nn /= nobs_sat) then
@@ -439,7 +439,7 @@ subroutine print_ob_sens
            outdata%osense_dry = 9.9e31_r_single
            outdata%osense_moist = 9.9e31_r_single
         end if
-        tmpanal_ob(1:nanals) = real(anal_ob(1:nanals,nob),r_single)
+        tmpanal_ob(1:nanals) = real(anal_ob_post(1:nanals,nob),r_single)
         write(iunit) outdata,tmpanal_ob
         if(.not. efsoi_flag) cycle
         ! Sum up
@@ -510,7 +510,7 @@ subroutine print_ob_sens
            outdata%osense_dry = 9.9e31_r_single
            outdata%osense_moist = 9.9e31_r_single
         end if
-        tmpanal_ob(1:nanals) = real(anal_ob(1:nanals,nob),r_single)
+        tmpanal_ob(1:nanals) = real(anal_ob_post(1:nanals,nob),r_single)
         write(iunit) outdata,tmpanal_ob,tmpbiaspreds
         if(.not. efsoi_flag) cycle
         ! Sum up
diff --git a/src/enkf/enkf_obsmod.f90 b/src/enkf/enkf_obsmod.f90
index 06984bb080..b5d1206541 100644
--- a/src/enkf/enkf_obsmod.f90
+++ b/src/enkf/enkf_obsmod.f90
@@ -146,6 +146,9 @@ module enkf_obsmod
 type(c_ptr)                             :: anal_ob_modens_cp    ! C pointer
 integer :: shm_win, shm_win2
 
+! ob-space posterior ensemble, needed for EFSOI
+real(r_single),public,allocatable, dimension(:,:) :: anal_ob_post   ! Fortran pointer
+
 contains
 
 subroutine readobs()
@@ -453,6 +456,7 @@ subroutine obsmod_cleanup()
 if (allocated(probgrosserr)) deallocate(probgrosserr)
 if (allocated(prpgerr)) deallocate(prpgerr)
 if (allocated(diagused)) deallocate(diagused)
+if (allocated(anal_ob_post)) deallocate(anal_ob_post)
 ! free shared memory segement, fortran pointer to that memory.
 nullify(anal_ob)
 call MPI_Barrier(mpi_comm_world,ierr)
diff --git a/src/enkf/gridio_gfs.f90 b/src/enkf/gridio_gfs.f90
index 9dada826d9..58ceff78a6 100644
--- a/src/enkf/gridio_gfs.f90
+++ b/src/enkf/gridio_gfs.f90
@@ -40,7 +40,7 @@ module gridio
 !   language: f95
 !
 !$$$
- use constants, only: zero,one,cp,fv,rd,tiny_r_kind,max_varname_length,t0c,r0_05,constants_initialized
+ use constants, only: zero,one,cp,fv,rd,tiny_r_kind,max_varname_length,t0c,r0_05
  use params, only: nlons,nlats,nlevs,use_gfs_nemsio,pseudo_rh, &
                    cliptracers,datapath,imp_physics,use_gfs_ncio,cnvw_option, &
                    nanals
diff --git a/src/enkf/params.f90 b/src/enkf/params.f90
index 5bcfa062b9..e3d50d53be 100644
--- a/src/enkf/params.f90
+++ b/src/enkf/params.f90
@@ -761,10 +761,6 @@ subroutine read_namelist()
 corrlengthtr = corrlengthtr * 1.e3_r_single/rearth
 corrlengthsh = corrlengthsh * 1.e3_r_single/rearth
 
-if(efsoi_cycling) then
-   letkf_flag = .false.
-end if
-
 ! convert targe area boundary into radians
 tar_minlat = tar_minlat * deg2rad
 tar_maxlat = tar_maxlat * deg2rad
diff --git a/src/gsi/constants.f90 b/src/gsi/constants.f90
index caf4331532..da2368d70f 100644
--- a/src/gsi/constants.f90
+++ b/src/gsi/constants.f90
@@ -84,6 +84,10 @@ module constants
   public :: soilmoistmin
   public :: stndrd_atmos_ps
 
+! ------ EFSOI relevant parameters -------- !
+  public :: tref, pref
+  public :: constants_initialized
+
 ! Declare derived constants
   integer(i_kind):: huge_i_kind
   integer(i_kind), parameter :: max_varname_length=32
@@ -109,6 +113,8 @@ module constants
   real(r_kind),parameter::  ttp    = 2.7316e+2_r_kind            !  temperature at h2o triple point (K)
   real(r_kind),parameter::  jcal   = 4.1855e+0_r_kind            !  joules per calorie              ()
   real(r_kind),parameter::  stndrd_atmos_ps = 1013.25e2_r_kind   ! 1976 US standard atmosphere ps   (Pa)
+  real(r_kind),parameter::  tref   = 2.8000e+2_r_kind            ! reference T for total energy
+  real(r_kind),parameter::  pref   = 1.0000e+5_r_kind            ! reference P for total energy
 
 ! Numeric constants
 
@@ -283,6 +289,9 @@ module constants
   integer(i_kind),parameter:: i_missing=-9999
   integer(r_kind),parameter:: r_missing=-9999._r_kind
 
+! Constants initialized
+  logical :: constants_initialized = .true.
+
 contains
 
   subroutine init_constants_derived
diff --git a/util/EFSOI_Utilities/src/CMakeLists.txt b/util/EFSOI_Utilities/src/CMakeLists.txt
new file mode 100644
index 0000000000..49d2a8fdee
--- /dev/null
+++ b/util/EFSOI_Utilities/src/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 2.6)
+if(BUILD_EFSOI)
+
+  file(GLOB LOCAL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*90)
+
+  set_source_files_properties( ${LOCAL_SRC} PROPERTIES COMPILE_FLAGS ${ENKF_Fortran_FLAGS} )
+  
+  add_executable(global_efsoi.x  ${LOCAL_SRC} )
+  
+  SET( CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${OMPFLAG}" )
+  
+  include_directories(${CMAKE_CURRENT_BINARY_DIR} "${PROJECT_BINARY_DIR}/include/global"  ${CMAKE_CURRENT_BINARY_DIR}/.. ${MPI_Fortran_INCLUDE_DIRS} ${MPI_Fortran_INCLUDE_PATH} ${CORE_INCS} ${NETCDF_INCLUDE_DIRS} ${NCDIAG_INCS} ${FV3GFS_NCIO_INCS}) 
+
+  target_link_libraries( global_efsoi.x enkflib enkfdeplib ${GSILIB} ${GSISHAREDLIB} ${CORE_LIBRARIES}
+  ${MPI_Fortran_LIBRARIES} ${LAPACK_LIBRARIES} ${NETCDF_Fortran_LIBRARIES} ${NETCDF_C_LIBRARIES}
+  ${FV3GFS_NCIO_LIBRARIES} 
+  ${EXTRA_LINKER_FLAGS} ${GSI_LDFLAGS} ${CORE_BUILT} ${CORE_LIBRARIES} ${CORE_BUILT} ${NCDIAG_LIBRARIES})  
+endif()
+
+
+
diff --git a/util/EFSOI_Utilities/src/efsoi.f90 b/util/EFSOI_Utilities/src/efsoi.f90
new file mode 100644
index 0000000000..54527c559c
--- /dev/null
+++ b/util/EFSOI_Utilities/src/efsoi.f90
@@ -0,0 +1,329 @@
+module efsoi
+!$$$  module documentation block
+!
+! module: efsoi                        Update observation impact estimates
+!                                      with the EnSRF framework.
+!
+! prgmmr: ota              org: np23                   date: 2011-12-01
+! prgmmr: groff            org: emc                    date: 2018-05-24
+!
+! abstract:
+!  Computes the observation impact estimates using the EnKF analysis products.
+!  Formulation is based on Kalnay et al (2012, Tellus A, submitted).
+!  Parallel processing is following the way of the serial EnSRF.
+!
+! Public Subroutines:
+!  efsoi_update: performs the Forecast Sensitivity to Observations update within
+!              the EnSRF framework. The EnKF other than the serial EnSRF 
+!              (e.g. the LETKF) can be also used here (the method is 
+!              independent on the type of EnKF).
+!
+! Public Variables: None
+!
+! Modules Used: kinds, constants, params, covlocal, mpisetup, loadbal, statevec,
+!               kdtree2_module, obsmod, gridinfo, obs_sensitivity
+!
+! program history log:
+!   2011-12-01  Created from the LETKF core module.
+!   2012-04-04  Changed to EnSRF framework for saving memory
+!   2018-05-24  Renamed module added non-bias corrected EFSOI
+!
+! attributes:
+!   language: f95
+!
+!$$$
+
+use mpisetup
+use covlocal, only:  taper
+use kinds, only: r_double, i_kind, r_kind
+use kdtree2_module, only: kdtree2_r_nearest, kdtree2_result
+use loadbal_efsoi, only: numptsperproc, indxproc, lnp_chunk, kdtree_grid, &
+                   iprocob, indxob_chunk, anal_obchunk_prior, numobsperproc, &
+                   indxproc_obs, nobs_max
+use scatter_chunks_efsoi, only: fcerror_chunk, anal_chunk
+use enkf_obsmod, only: oberrvar, ob, ensmean_ob, obloc, obloclon, obloclat, oblnp, &
+                       obtime, nobstot, corrlengthsq, lnsigl, obtimel, anal_ob
+use constants, only: constants_initialized, pi, zero, one
+use params, only: nanals, corrlengthnh, corrlengthtr, corrlengthsh, &
+                  tar_minlon, tar_maxlon, tar_minlat, tar_maxlat, &
+                  tar_minlev, tar_maxlev
+use gridinfo, only: nlevs_pres, lonsgrd, latsgrd
+use statevec_efsoi, only: id_u, id_v, id_t, id_q, id_ps
+use enkf_obs_sensitivity, only: obsense_kin, obsense_dry, obsense_moist, adloc_chunk
+
+implicit none
+
+private
+public :: efsoi_update
+
+contains
+
+subroutine efsoi_update()
+implicit none
+real(r_kind),allocatable,dimension(:) :: djdy_kin, djdy_dry, djdy_moist
+real(r_kind),allocatable,dimension(:) :: uvwork, tpwork, qwork
+real(r_kind),allocatable,dimension(:) :: taper_disgrd
+real(r_kind),dimension(nobstot) :: obdep, oberinv
+real(r_single),dimension(nobstot) :: invobtimel, invlnsigl
+real(r_single),dimension(nobstot) :: invcorlen
+real(r_kind),allocatable,dimension(:,:) :: buffertmp
+type(kdtree2_result),allocatable,dimension(:) :: sresults1
+real(r_kind) :: anal_obtmp(nanals)
+real(r_kind) :: r,taper1,taper3,taperv
+real(r_single) :: lnsig
+real(r_double) :: t1, t2, t3, t4, tbegin, tend
+integer(i_kind) :: np, nob, nob1, nob2, nobm, npob, nf2, i, ii, npt, nanal, nn
+integer(i_kind) :: nnpt, nsame, nskip, ngrd1
+integer(i_kind) :: ierr
+logical :: kdgrid
+
+
+if (.not. constants_initialized) then
+    print *,'constants not initialized (with init_constants, init_constants_derived)'
+    call stop2(28)
+end if
+
+allocate(sresults1(nlevs_pres*numptsperproc(nproc+1)))
+allocate(taper_disgrd(nlevs_pres*numptsperproc(nproc+1)))
+
+! Compute the inverse of cut-off length and 
+! the observation departure from first guess
+!$omp parallel do private(nob)
+do nob=1,nobstot
+   invcorlen(nob) = one/corrlengthsq(nob)
+   invlnsigl(nob) = one/lnsigl(nob)
+   invobtimel(nob)= one/obtimel(nob)
+   oberinv(nob)   = one/oberrvar(nob)
+   obdep(nob)     = ob(nob)-ensmean_ob(nob)
+end do
+
+
+kdgrid=associated(kdtree_grid)
+allocate(djdy_kin(nobstot))
+allocate(djdy_dry(nobstot))
+allocate(djdy_moist(nobstot))
+djdy_kin(1:nobstot) = zero
+djdy_dry(1:nobstot) = zero
+djdy_moist(1:nobstot) = zero
+allocate(uvwork(nanals))
+allocate(tpwork(nanals))
+allocate(qwork(nanals))
+
+nsame = 0
+nskip = 0
+nobm = 1
+t2 = zero
+t3 = zero
+t4 = zero
+tbegin = mpi_wtime()
+
+! 
+! Loop for each observations
+obsloop: do nob=1,nobstot
+
+
+   if (nproc == 0 .and. modulo(nob , 10000) == 0 ) print *,'doing nob=',nob
+
+   t1 = mpi_wtime()
+
+   if(oberrvar(nob) > 1.e10_r_kind .or. abs(obtime(nob)) >= obtimel(nob))then
+      nskip = nskip + 1
+      cycle obsloop
+   end if
+
+   ! what processor is this ob on?
+   npob = iprocob(nob)
+
+   ! get anal_ob from that processor and send to other processors.
+   if (nproc == npob) then
+      nob1 = indxob_chunk(nob)
+      anal_obtmp(1:nanals) = anal_obchunk_prior(:,nob1)
+   end if
+ 
+   call mpi_bcast(anal_obtmp,nanals,mpi_realkind,npob,mpi_comm_world,ierr)
+   
+   anal_obtmp(1:nanals) = anal_obtmp(1:nanals) * oberinv(nob)
+
+   t2 = t2 + mpi_wtime() - t1
+   t1 = mpi_wtime()
+
+
+
+
+
+   ! ============================================================  
+   ! Only need to recalculate nearest points when lat/lon is different
+   ! ============================================================  
+   if(nob == 1 .or. &
+         abs(obloclat(nob)-obloclat(nobm)) .gt. tiny(obloclat(nob)) .or. &
+         abs(obloclon(nob)-obloclon(nobm)) .gt. tiny(obloclon(nob)) .or. &
+         abs(corrlengthsq(nob)-corrlengthsq(nobm)) .gt. tiny(corrlengthsq(nob))) then
+		 
+      nobm=nob
+      ! determine localization length scales based on latitude of ob.
+      
+      nf2=0
+      ! search analysis grid points for those within corrlength of 
+      ! ob being assimilated (using a kd-tree for speed).
+      if (kdgrid) then
+         call kdtree2_r_nearest(tp=kdtree_grid,qv=obloc(:,nob), &
+              r2=corrlengthsq(nob), &
+              nfound=nf2,nalloc=nlevs_pres*numptsperproc(nproc+1), &
+              results=sresults1)
+      else
+         ! use brute force search if number of grid points on this proc <= 3
+         do nn=1,nlevs_pres
+            do npt=1,numptsperproc(nproc+1)
+               nnpt = nlevs_pres * (npt-1) + nn
+               r = sum( (obloc(:,nob)-adloc_chunk(:,nnpt))**2, 1 )
+               if (r < corrlengthsq(nob)) then
+                  nf2 = nf2 + 1
+                  sresults1(nf2)%idx = nnpt
+                  sresults1(nf2)%dis = r
+               end if
+            end do
+         end do
+      end if
+	  
+	  !$omp parallel do private(nob1)
+      do nob1=1,nf2
+         taper_disgrd(nob1) = taper(sqrt(sresults1(nob1)%dis*invcorlen(nob)))
+      end do
+	  
+   else
+      nsame=nsame+1
+   end if
+
+   t3 = t3 + mpi_wtime() - t1
+   t1 = mpi_wtime()
+
+   ! ============================================================  
+   ! forecast error sensitivity to the observations
+   ! ============================================================  
+   if (nf2 > 0) then
+	   
+      taper3=taper(obtime(nob)*invobtimel(nob))
+	  
+      uvwork(1:nanals) = zero
+      tpwork(1:nanals) = zero
+      qwork(1:nanals)  = zero
+	  
+      ! rho * X_f^T (e_t|0 + e_t|-6) / 2
+      ! contributions from (U,V), (T,Ps) and Q are computed separately
+      
+      do ii=1,nf2
+         taper1=taper_disgrd(ii)*taper3
+		 
+         i = (sresults1(ii)%idx-1)/nlevs_pres+1
+		 
+         ngrd1=indxproc(nproc+1,i)
+		 
+         if(tar_minlon <= tar_maxlon .and. &
+              & (lonsgrd(ngrd1) < tar_minlon .or. lonsgrd(ngrd1) > tar_maxlon .or. &
+              & latsgrd(ngrd1) < tar_minlat .or. latsgrd(ngrd1) > tar_maxlat)) &
+              & cycle
+			  
+         if(tar_minlon > tar_maxlon .and. &
+              & ((lonsgrd(ngrd1) < tar_minlon .and. lonsgrd(ngrd1) > tar_maxlon) .or. &
+              & latsgrd(ngrd1) < tar_minlat .or. latsgrd(ngrd1) > tar_maxlat)) &
+              & cycle
+			  
+         nn = sresults1(ii)%idx - (i-1)*nlevs_pres
+		 
+         if((tar_minlev /= 1 .or. nn /= nlevs_pres) &
+              & .and. (nn > tar_maxlev .or. nn < tar_minlev)) cycle
+         lnsig = abs(lnp_chunk(i,nn)-oblnp(nob))
+		 
+         if(lnsig < lnsigl(nob))then
+            taperv=taper1*taper(lnsig*invlnsigl(nob))
+         else
+            cycle
+         end if
+		 
+         if(nn == nlevs_pres) then
+            do nanal=1,nanals
+               tpwork(nanal) = tpwork(nanal) + taperv &
+                    & * anal_chunk(nanal,i,id_ps) * fcerror_chunk(i,id_ps)
+            end do
+            cycle
+         end if
+
+
+         ! 		 
+         do nanal=1,nanals
+            uvwork(nanal) = uvwork(nanal) + taperv &
+                 & * (anal_chunk(nanal,i,id_u(nn)) * fcerror_chunk(i,id_u(nn)) &
+                 & + anal_chunk(nanal,i,id_v(nn)) * fcerror_chunk(i,id_v(nn)))
+				 
+            tpwork(nanal) = tpwork(nanal) + taperv &
+                 & * anal_chunk(nanal,i,id_t(nn)) * fcerror_chunk(i,id_t(nn))
+				 
+            if(id_q(nn) > 0) qwork(nanal) = qwork(nanal) + taperv &
+                 & * anal_chunk(nanal,i,id_q(nn)) * fcerror_chunk(i,id_q(nn))
+         end do
+		 
+      end do
+	  
+      ! R^-1 HX_a [rho * X_f^T (e_t|0 + e_t|-6) / 2]
+      do nanal=1,nanals
+         djdy_kin(nob) 		= djdy_kin(nob)   + anal_obtmp(nanal) * uvwork(nanal)
+         djdy_dry(nob) 		= djdy_dry(nob)   + anal_obtmp(nanal) * tpwork(nanal)
+         djdy_moist(nob) 	= djdy_moist(nob) + anal_obtmp(nanal) * qwork(nanal)
+      end do
+	  
+   end if ! end of, if (nf2 > 0) then
+  
+   
+   t4 = t4 + mpi_wtime() - t1
+   t1 = mpi_wtime()
+
+end do obsloop
+
+tend = mpi_wtime()
+if (nproc == 0 .or. nproc == numproc-1) print *,'time to process FSO on gridpoint = ',tend-tbegin,t2,t3,t4,' secs'
+
+if (nproc == 0 .and. nskip > 0) print *,nskip,' out of',nobstot,'obs skipped'
+if (nproc == 0 .and. nsame > 0) print *,nsame,' out of', nobstot-nskip,' same lat/lon'
+
+! Observation sensitivity post process
+!$omp parallel do private(nob)
+do nob=1,nobstot
+   djdy_dry(nob) 		= djdy_dry(nob)   + djdy_kin(nob)
+   djdy_moist(nob) 		= djdy_moist(nob) + djdy_dry(nob)
+   obsense_kin(nob) 	= djdy_kin(nob)   * obdep(nob)
+   obsense_dry(nob) 	= djdy_dry(nob)   * obdep(nob)
+   obsense_moist(nob) 	= djdy_moist(nob) * obdep(nob)
+end do
+
+! Gathering analysis perturbations projected on the observation space
+if(nproc /= 0) then
+  call mpi_send(anal_obchunk_prior,numobsperproc(nproc+1)*nanals,mpi_real4,0, &
+       1,mpi_comm_world,ierr)
+else
+   allocate(anal_ob(1:nanals,nobstot))
+   allocate(buffertmp(nanals,nobs_max))
+   do np=1,numproc-1
+      call mpi_recv(buffertmp,numobsperproc(np+1)*nanals,mpi_real4,np, &
+           1,mpi_comm_world,mpi_status,ierr)
+      do nob1=1,numobsperproc(np+1)
+         nob2 = indxproc_obs(np+1,nob1)
+         anal_ob(:,nob2) = buffertmp(:,nob1)
+      end do
+   end do
+   do nob1=1,numobsperproc(1)
+      nob2 = indxproc_obs(1,nob1)
+      anal_ob(:,nob2) = anal_obchunk_prior(:,nob1)
+   end do
+   deallocate(buffertmp)
+end if
+
+t1 = mpi_wtime() - tend
+if (nproc == 0) print *,'time to observation sensitivity post process = ',t1,' secs'
+
+deallocate(anal_obchunk_prior)
+deallocate(sresults1)
+deallocate(djdy_kin,djdy_dry,djdy_moist,taper_disgrd,uvwork,tpwork,qwork)
+
+return
+end subroutine efsoi_update
+end module efsoi
diff --git a/util/EFSOI_Utilities/src/efsoi_main.f90 b/util/EFSOI_Utilities/src/efsoi_main.f90
new file mode 100644
index 0000000000..e4e6307441
--- /dev/null
+++ b/util/EFSOI_Utilities/src/efsoi_main.f90
@@ -0,0 +1,166 @@
+program efsoi_main
+!$$$  main program documentation block
+!
+! program:  efsoi_main                  high level driver program for 
+!                                       efsoi calculations.
+!
+! prgmmr: Ota         org: EMC/JMA               date: 2012
+! prgmmr: Groff       org: EMC                   date: 2018
+!
+! abstract: This is the main program for EFSOI code. It does the following:
+!           a) initialize MPI, read EFSOI namelist from efsoi.nml on each task.
+!           b) reads observation sensitivity files.
+!           c) read horizontal grid information (lat/lon of each grid point) and
+!              pressure at each grid point/vertical level.
+!           d) decomposition of horizontal grid points and observation
+!              priors to minimize load imbalance for EFSOI calcs.
+!           e) read forecast and analysis states necessary for
+!              EFSOI calculations.
+!           f) Initialize/allocate for EFSOI moist/dry/kinetic
+!              total output
+!           g) Estimate the location of observation response
+!              (i.e. advect the localization).  Default estimation
+!              approach is to multiply meridional and zonal wind
+!              by 0.75.
+!           h) Perform EFSOI calculations for all observations
+!              considered for assimilation during the EnSRF update.
+!           i) write EFSOI calculations and ancillary EFSOI
+!              information to file
+!           j) deallocate all allocatable arrays, finalize MPI.
+!
+! program history log:
+!   2018-04-30  Initial development. Adaptation of enkf_main program
+!               towards EFSOI calculation process
+!
+! usage:
+!   input files: Update to FV3 nomenclature
+!     sigfAT_YYYYMMDDHH_mem* - Ensemble of forecasts valid at advance time
+!                              (AT) hours beyond the initial time
+!     sigfAT_YYYYMMDDHH_ensmean - Ensemble mean forecast AT hours from
+!                                 initial time
+!     sigfAT+6_YYYYMMDDHH-6_ensmean - Ensemble mean forecast AT+6 hours
+!                                     from initial time minus 6 hours
+!     siganl.YYYYMMDDHH.gdas - 
+!   output files: 
+!     obimpact_YYYYMMDDHH.dat - observation impact file
+!                         
+! comments: This program is a wrapper for components needed to peform
+!           EFSOI calculations
+!
+! attributes:
+!   language: f95
+!
+!$$$
+
+ use kinds, only: r_double,i_kind
+ ! reads namelist parameters.
+ ! applying enkf namelist apparatus
+ use params, only : read_namelist,nanals
+ ! mpi functions and variables.
+ use mpisetup, only:  mpi_initialize, mpi_initialize_io, mpi_cleanup, nproc, &
+                      mpi_wtime, mpi_comm_world
+ ! model state vector 
+ use statevec_efsoi, only: read_state_efsoi, statevec_cleanup_efsoi, init_statevec_efsoi
+ ! load balancing
+ use loadbal_efsoi, only: load_balance_efsoi, loadbal_cleanup_efsoi
+ ! efsoi update
+ use efsoi, only: efsoi_update
+ ! Observation sensitivity usage
+ use enkf_obs_sensitivity, only: init_ob_sens, print_ob_sens, destroy_ob_sens, read_ob_sens
+ use loc_advection, only: loc_advection_efsoi
+ ! Scatter chunks for EFSOI
+ use scatter_chunks_efsoi, only: scatter_chunks_ob_impact
+ use omp_lib, only: omp_get_max_threads
+ ! Needed for rad fixed files 
+ use radinfo, only: init_rad, init_rad_vars 
+ 
+ implicit none
+ integer :: ierr, nth
+ real(r_double) t1,t2
+
+ ! 0. initialize MPI.
+ call mpi_initialize()
+ if (nproc==0) call w3tagb('EFSOI_CALC',2021,0319,0055,'NP25')
+
+ call init_rad()
+
+ ! 1. read namelist.
+ call read_namelist()
+ 
+ ! 2. initialize MPI communicator for IO tasks.
+ call mpi_initialize_io(nanals)
+
+ ! 3. Initialize rad vars
+ call init_rad_vars()
+
+ ! 4. read the necessary inputs for the EFSOI calculation from file
+ t1 = mpi_wtime()
+ call read_ob_sens()
+ t2 = mpi_wtime()
+ if (nproc == 0) print *, 'time in read_ob_sens = ',t2-t1,'on proc', nproc
+
+ nth= omp_get_max_threads()
+ if(nproc== 0)write(6,*) 'efsoi_main:  number of threads ',nth
+
+ ! 5. Halt processors until all are completed
+ call mpi_barrier(mpi_comm_world, ierr)
+
+ ! 6. Initialize state vector information
+ call init_statevec_efsoi()
+
+ ! 7. read in ensemble forecast members,
+ ! valid at the evaluation forecast
+ ! time, distribute pieces to each task.
+ t1 = mpi_wtime()
+ call read_state_efsoi()
+ t2 = mpi_wtime()
+ if (nproc == 0) print *,'time in read_state_efsoi =',t2-t1,'on proc',nproc
+
+ ! 8. do load balancing (partitioning of grid points
+ ! and observations among processors)
+ t1 = mpi_wtime()
+ call load_balance_efsoi()
+ t2 = mpi_wtime()
+ if (nproc == 0) print *,'time in load_balance_efsoi =',t2-t1,'on proc',nproc
+
+ ! 9. apply scattering of efsoi chunks
+ t1 = mpi_wtime()
+ call scatter_chunks_ob_impact()  ! ensemble scattering
+ t2 = mpi_wtime()
+ if (nproc == 0) print *,'time to scatter observation impact chunks =',t2-t1,'on proc',nproc
+
+ ! 10. Initialize EFSOI variables
+ t1 = mpi_wtime()
+ call init_ob_sens()
+ t2 = mpi_wtime() 
+ if (nproc == 0) print *,'time to allocate ob sensitivity variables =',t2-t1,'on proc',nproc
+
+ ! 11. Calculate the estimated location of observation
+ ! response at evaluation time
+ t1 = mpi_wtime()
+ call loc_advection_efsoi()
+ t2 = mpi_wtime()
+ if (nproc == 0) print *,'time in loc_advection_efsoi =',t2-t1,'on proc',nproc
+
+ ! 12. Perform the EFSOI calcs
+ t1 = mpi_wtime()
+ call efsoi_update()
+ t2 = mpi_wtime()
+ if (nproc == 0) print *,'time in efsoi_update =',t2-t1,'on proc',nproc
+
+ ! 13. print EFSOI sensitivity i/o on root task.
+ t1 = mpi_wtime()
+ call print_ob_sens()
+ t2 = mpi_wtime()
+ if (nproc == 0) print *,'time needed to write observation impact file =',t2-t1,'on proc',nproc
+
+ ! 14. Cleanup for EFSOI configuration
+ call statevec_cleanup_efsoi()
+ call loadbal_cleanup_efsoi()
+ call destroy_ob_sens()
+
+ ! 15. finalize MPI.
+ if (nproc==0) call w3tage('EFSOI_CALC')
+ call mpi_cleanup()
+
+end program efsoi_main
diff --git a/util/EFSOI_Utilities/src/loadbal_efsoi.f90 b/util/EFSOI_Utilities/src/loadbal_efsoi.f90
new file mode 100644
index 0000000000..778468a469
--- /dev/null
+++ b/util/EFSOI_Utilities/src/loadbal_efsoi.f90
@@ -0,0 +1,380 @@
+module loadbal_efsoi
+!$$$  module documentation block
+!
+! module: loadbal          decompose ob priors and horizontal grid points
+!                          to minimize load imbalance. Creates various
+!                          arrays that define the decomposition.
+!
+! prgmmr: whitaker         org: esrl/psd               date: 2009-02-23
+!         groff            org: emc                    date: 2018-09-03
+! abstract: 
+!
+! Public Subroutines:
+!  load_balance_efsoi: set up decomposition (for ob priors and analysis grid points)
+!   that minimizes load imbalance.  
+!   The decomposition uses "Graham's rule", which simply
+!   stated, assigns each new work item to the task that currently has the 
+!   smallest load.
+!  loadbal_cleanup_efsoi: deallocate allocated arrays.
+!
+! Private Subroutines:
+!  estimate_work_efsoi: estimate work needed to update each analysis grid
+!   point (considering all the observations within the localization radius).
+! Public Variables (all defined by subroutine load_balance_efsoi):
+!  npts_min: (integer scalar) smallest number of grid points assigned to a task.
+!  npts_max: (integer scalar) maximum number of grid points assigned to a task.
+!  nobs_min: (integer scalar, serial enkf only) smallest number of observation priors assigned to a task.
+!  nobs_max: (integer scalar, serial enkf only) maximum number of observation priors assigned to a task.
+!  numproc: (integer scalar) total number of MPI tasks (from module mpisetup)
+!  nobstot: (integer scalar) total number of obs to be assimilated (from module
+!   enkf_obsmod).
+!  numobsperproc(numproc): (serial enkf only) integer array containing # of ob priors 
+!   assigned to each task.
+!  numptsperproc(numproc): integer array containing # of grid points assigned to
+!   each task.
+!  indxproc(numproc,npts_max): integer array with the indices (1,2,...npts) of 
+!   analysis grid points assigned to each task.
+!  indxproc_obs(numproc,nobs_max): (serial enkf only) integer array with the indices
+!   (1,2,...nobstot) of observation priors assigned to that task.
+!  iprocob(nobstot): (serial enkf only) integer array containing the task number that has been
+!   assigned to update each observation prior.
+!  indxob_chunk(nobstot): (serial enkf only) integer array that maps the index of the ob priors
+!   being assimilated (1,2,3...nobstot) to the index of the obs being 
+!   updated on that task (1,2,3,...numobsperproc(nproc)) - inverse of
+!   indxproc_obs.
+!  ensmean_obchunk(nobs_max): (serial enkf only) real array of ensemble mean observation priors
+!   being updated on that task (use indxproc_obs to find
+!   corresponding index in ensemble_ob).
+!  obloc_chunk(3,nobs_max): (serial enkf only) real array of spherical cartesian coordinates
+!   of ob priors being updated on this task.
+!  grdloc_chunk(3,npts_max): real array of spherical cartesian coordinates
+!   of analysis grid points being updated on this task.
+!  lnp_chunk(npts_max,ncdim): real array of log(pressures) of control variables
+!   being updated on this task.
+!  oblnp_chunk(nobs_max,ncdim): (serial enkf only) real array of log(pressures) of ob priors
+!   being updated on this task.
+!  obtime_chunk(nobs_max): (serial enkf only) real array of ob times of ob priors
+!   being updated on this task (expressed as an offset from the analysis time in
+!   hours).
+!  anal_obchunk_prior(nanals,nobs_max): (serial enkf only) real array of observation prior 
+!   ensemble perturbations to be updated on this task (not used in LETKF).
+!  kdtree_grid: pointer to kd-tree structure used for nearest neighbor searches
+!   for model grid points (only searches grid points assigned to this task).
+!  kdtree_obs: pointer to kd-tree structure used for nearest neighbor searches
+!   for observations (only searches ob locations assigned to this task).
+!  kdtree_obs2: (LETKF only) pointer to kd-tree structure used for nearest neighbor searches
+!   for observations (searches all observations)
+!  anal_chunk(nanals,npts_max,ncdim,nbackgrounds): real array of ensemble perturbations
+!   updated on each task.
+!  anal_chunk_prior(nanals,npts_max,ncdim,nbackgrounds): real array of prior ensemble
+!   perturbations.  Before analysis anal_chunk=anal_chunk_prior, after
+!   analysis anal_chunk contains posterior perturbations.
+!  ensmean_chunk(npts_max,ncdim,nbackgrounds): real array containing pieces of ensemble
+!   mean to be updated on each task.
+!  ensmean_chunk_prior(npts_max,ncdim,nbackgrounds): as above, for ensemble mean prior.
+!   Before analysis ensmean_chunk=ensmean_chunk_prior, after analysis
+!   ensmean_chunk contains posterior ensemble mean.
+!   
+!
+! Modules Used: mpisetup, params, kinds, constants, enkf_obsmod, gridinfo,
+!               kdtree_module, covlocal, controlvec
+!
+! program history log:
+!   2009-02-23  Initial version.
+!   2011-06-21  Added the option of observation box selection for LETKF.
+!   2015-07-25  Remove observation box selection (use kdtree instead).
+!   2016-05-02  shlyaeva: modification for reading state vector from table
+!   2016-09-07  shlyaeva: moved distribution of chunks here from controlvec
+!
+! attributes:
+!   language: f95
+!
+!$$$
+
+use mpisetup
+use params, only: datapath, nanals, simple_partition, &
+                  corrlengthnh, corrlengthsh, corrlengthtr, lupd_obspace_serial,&
+                  efsoi_flag
+use enkf_obsmod, only: nobstot, obloc, oblnp, ensmean_ob, obtime, anal_ob, corrlengthsq
+use kinds, only: r_kind, i_kind, r_double, r_single
+use kdtree2_module, only: kdtree2, kdtree2_create, kdtree2_destroy, &
+                          kdtree2_result, kdtree2_r_nearest
+use gridinfo, only: gridloc, logp, latsgrd, nlevs_pres, npts
+use constants, only: zero, rad2deg, deg2rad
+
+implicit none
+private
+public :: load_balance_efsoi, loadbal_cleanup_efsoi
+
+real(r_single),public, allocatable, dimension(:,:) :: lnp_chunk, &
+                                                      anal_obchunk_prior
+real(r_single),public, allocatable, dimension(:,:,:) :: anal_chunk, anal_chunk_prior
+real(r_single),public, allocatable, dimension(:,:) :: ensmean_chunk, ensmean_chunk_prior
+
+! arrays passed to kdtree2 routines need to be single
+real(r_single),public, allocatable, dimension(:,:) :: obloc_chunk, grdloc_chunk
+real(r_single),public, allocatable, dimension(:) :: oblnp_chunk, &
+ obtime_chunk, ensmean_obchunk
+integer(i_kind),public, allocatable, dimension(:) :: iprocob, indxob_chunk,&
+                          numptsperproc, numobsperproc
+integer(i_kind),public, allocatable, dimension(:,:) :: indxproc, indxproc_obs
+integer(i_kind),public :: npts_min, npts_max, nobs_min, nobs_max
+integer(8) totsize
+! kd-tree structures.
+type(kdtree2),public,pointer :: kdtree_obs, kdtree_grid, kdtree_obs2
+
+contains
+
+subroutine load_balance_efsoi()
+! set up decomposition (for analysis grid points, and ob priors in serial EnKF)
+! that minimizes load imbalance. 
+! Uses "Graham's rule", which simply
+! stated, assigns each new work item to the task that currently has the 
+! smallest load.
+implicit none
+integer(i_kind), allocatable, dimension(:) :: rtmp,numobs
+integer(i_kind) np,i,n,nn,nob1,nob2,ierr
+real(r_double) t1
+logical test_loadbal
+
+! partition state vector for using Grahams rule..
+! ("When a new job arrives, allocate it to the server 
+! that currently has the smallest load")
+allocate(numobs(npts))
+allocate(numptsperproc(numproc))
+allocate(rtmp(numproc))
+t1 = mpi_wtime()
+! assume work load proportional to number of 'nearby' obs
+call estimate_work_efsoi(numobs) ! fill numobs array with number of obs per horiz point
+! distribute the results of estimate_work to all processors.
+call mpi_allreduce(mpi_in_place,numobs,npts,mpi_integer,mpi_sum,mpi_comm_world,ierr)
+if (nproc == 0) print *,'time in estimate_work_efsoi = ',mpi_wtime()-t1,' secs'
+if (nproc == 0) print *,'min/max numobs',minval(numobs),maxval(numobs)
+! loop over horizontal grid points on analysis grid.
+t1 = mpi_wtime()
+rtmp = 0
+numptsperproc = 0
+np = 0
+test_loadbal = .false. ! simple partition for testing
+! 
+!PRINT *, npts, 'npts load bal'
+do n=1,npts
+   if (test_loadbal) then
+       ! use simple partition (does not use estimated workload) for testing
+       np = np + 1
+       if (np > numproc) np = 1
+   else
+       np = minloc(rtmp,dim=1)
+       ! np is processor with the fewest number of obs to process
+       ! add this grid point to list for nmin
+       rtmp(np) = rtmp(np)+numobs(n)
+   endif
+   numptsperproc(np) = numptsperproc(np)+1
+   !PRINT *, numptsperproc, 'loadbal nptsperpr'
+    
+
+
+end do
+
+
+npts_max = maxval(numptsperproc)
+npts_min = minval(numptsperproc)
+
+allocate(indxproc(numproc,npts_max))
+! indxproc(np,i) is i'th horiz grid index for processor np.
+! there are numptsperpoc(np) i values for processor np
+rtmp = 0
+numptsperproc = 0
+np = 0
+do n=1,npts
+   if (test_loadbal) then
+       ! use simple partition (does not use estimated workload) for testing
+       np = np + 1
+       if (np > numproc) np = 1
+   else
+       np = minloc(rtmp,dim=1)
+       rtmp(np) = rtmp(np)+numobs(n)
+   endif
+   numptsperproc(np) = numptsperproc(np)+1 ! recalculate
+   indxproc(np,numptsperproc(np)) = n
+end do
+
+!PRINT *, numptsperproc, 'second time'
+! print estimated workload for each task
+if (nproc == 0) then
+   do np=1,numproc
+      rtmp(np) = 0
+      do n=1,numptsperproc(np)
+         rtmp(np) = rtmp(np) + numobs(indxproc(np,n))
+      enddo
+	  
+  	  
+   enddo
+   
+   print *,'min/max estimated work ',&
+   minval(rtmp),maxval(rtmp)
+
+endif
+deallocate(rtmp,numobs)
+
+if (nproc == 0) then
+    print *,'npts = ',npts
+    print *,'min/max number of points per proc = ',npts_min,npts_max
+    print *,'time to do model space decomp = ',mpi_wtime()-t1
+end if
+
+! setup arrays to hold subsets of grid information for each task.
+allocate(grdloc_chunk(3,numptsperproc(nproc+1)))
+allocate(lnp_chunk(numptsperproc(nproc+1),nlevs_pres))
+do i=1,numptsperproc(nproc+1)
+   grdloc_chunk(:,i) = gridloc(:,indxproc(nproc+1,i))
+   do nn=1,nlevs_pres
+      lnp_chunk(i,nn) = logp(indxproc(nproc+1,i),nn)
+   end do
+end do
+
+allocate(numobsperproc(numproc))
+allocate(iprocob(nobstot))
+! default is to partition obs simply, since
+! speed up from using Graham's rule for observation process
+! often does not justify cost of estimating workload in ob space.
+
+! distribute obs without trying to estimate workload
+t1 = mpi_wtime()
+numobsperproc = 0
+np=0
+do n=1,nobstot
+   np=np+1
+   if(np > numproc)np = 1
+   numobsperproc(np) = numobsperproc(np)+1
+   iprocob(n) = np-1
+enddo
+nobs_min = minval(numobsperproc)
+nobs_max = maxval(numobsperproc)
+allocate(indxproc_obs(numproc,nobs_max))
+numobsperproc = 0
+do n=1,nobstot
+   np=iprocob(n)+1
+   numobsperproc(np) = numobsperproc(np)+1 ! recalculate
+   ! indxproc_obs(np,i) is i'th ob index for processor np.
+   ! there are numobsperpoc(np) i values for processor np
+   indxproc_obs(np,numobsperproc(np)) = n
+end do
+if (nproc == 0) then
+   print *,'nobstot = ',nobstot
+   print *,'min/max number of obs per proc = ',nobs_min,nobs_max
+   print *,'time to do ob space decomp = ',mpi_wtime()-t1
+end if
+! for serial enkf, send out observation priors to be updated on each processor.
+allocate(anal_obchunk_prior(nanals,nobs_max))
+if(nproc == 0) then
+   print *,'sending out observation prior ensemble perts from root ...'
+   totsize = nobstot
+   totsize = totsize*nanals
+   print *,'nobstot*nanals',totsize
+   t1 = mpi_wtime()
+   ! send one big message to each task.
+   do np=1,numproc-1
+      do nob1=1,numobsperproc(np+1)
+         nob2 = indxproc_obs(np+1,nob1)
+         anal_obchunk_prior(1:nanals,nob1) = anal_ob(1:nanals,nob2)
+      end do
+      call mpi_send(anal_obchunk_prior,nobs_max*nanals,mpi_real4,np, &
+           1,mpi_comm_world,ierr)
+   end do
+   ! anal_obchunk_prior on root (no send necessary)
+   do nob1=1,numobsperproc(1)
+      nob2 = indxproc_obs(1,nob1)
+      anal_obchunk_prior(1:nanals,nob1) = anal_ob(1:nanals,nob2)
+   end do
+      ! now we don't need anal_ob anymore for serial EnKF.
+      if (.not. lupd_obspace_serial) deallocate(anal_ob)
+else
+   ! recv one large message on each task.
+   call mpi_recv(anal_obchunk_prior,nobs_max*nanals,mpi_real4,0, &
+        1,mpi_comm_world,mpi_status,ierr)
+end if
+call mpi_barrier(mpi_comm_world, ierr)
+if(nproc == 0) print *,'... took ',mpi_wtime()-t1,' secs'
+! these arrays only needed for serial filter
+! nob1 is the index of the obs to be processed on this rank
+! nob2 maps nob1 to 1:nobstot array (nobx)
+allocate(obloc_chunk(3,numobsperproc(nproc+1)))
+allocate(oblnp_chunk(numobsperproc(nproc+1)))
+allocate(obtime_chunk(numobsperproc(nproc+1)))
+allocate(ensmean_obchunk(numobsperproc(nproc+1)))
+allocate(indxob_chunk(nobstot))
+indxob_chunk = -1
+do nob1=1,numobsperproc(nproc+1)
+   nob2 = indxproc_obs(nproc+1,nob1)
+   oblnp_chunk(nob1) = oblnp(nob2)
+   obtime_chunk(nob1) = obtime(nob2)
+   indxob_chunk(nob2) = nob1
+   ensmean_obchunk(nob1) = ensmean_ob(nob2)
+   obloc_chunk(:,nob1) = obloc(:,nob2)
+enddo
+! set up kd-tree for efsoi to search only the subset
+! of gridpoints, obs to be updated on this processor..
+if (numptsperproc(nproc+1) >= 3 .and. .not. lupd_obspace_serial .and. .not. efsoi_flag) then
+   kdtree_grid => kdtree2_create(grdloc_chunk,sort=.false.,rearrange=.true.)
+endif
+if (numobsperproc(nproc+1) >= 3) then
+   kdtree_obs  => kdtree2_create(obloc_chunk,sort=.false.,rearrange=.true.)
+end if
+
+end subroutine load_balance_efsoi
+
+subroutine estimate_work_efsoi(numobs)
+! estimate work needed to update each analysis grid
+! point (considering all the observations within the localization radius).
+use covlocal, only:  latval
+
+implicit none
+integer(i_kind), dimension(:), intent(inout) :: numobs
+real(r_single) :: deglat,corrlength,corrsq
+type(kdtree2_result),dimension(:),allocatable :: sresults
+
+integer nob,n1,n2,i,ideln
+
+ideln = int(real(npts)/real(numproc))
+n1 = 1 + nproc*ideln
+n2 = (nproc+1)*ideln
+if (nproc == numproc-1) n2 = npts
+
+! loop over 'good' obs.
+numobs = 1 ! set min # of obs to 1, not 0 (so single ob test behaves)
+!$omp parallel do  schedule(dynamic,1) private(nob,i,deglat,corrlength,sresults,corrsq)
+obsloop: do i=n1,n2
+       do nob=1,nobstot
+          if (sum((obloc(1:3,nob)-gridloc(1:3,i))**2,1) < corrlengthsq(nob)) &
+            numobs(i) = numobs(i) + 1
+       end do 
+end do obsloop
+!$omp end parallel do
+
+end subroutine estimate_work_efsoi
+
+subroutine loadbal_cleanup_efsoi()
+! deallocate module-level allocatable arrays
+if (allocated(anal_chunk)) deallocate(anal_chunk)
+if (allocated(anal_chunk_prior)) deallocate(anal_chunk_prior)
+if (allocated(ensmean_chunk)) deallocate(ensmean_chunk)
+if (allocated(ensmean_chunk_prior)) deallocate(ensmean_chunk_prior)
+if (allocated(obloc_chunk)) deallocate(obloc_chunk)
+if (allocated(grdloc_chunk)) deallocate(grdloc_chunk)
+if (allocated(lnp_chunk)) deallocate(lnp_chunk)
+if (allocated(oblnp_chunk)) deallocate(oblnp_chunk)
+if (allocated(obtime_chunk)) deallocate(obtime_chunk)
+if (allocated(ensmean_obchunk)) deallocate(ensmean_obchunk)
+if (allocated(iprocob)) deallocate(iprocob)
+if (allocated(indxob_chunk)) deallocate(indxob_chunk)
+if (allocated(indxproc_obs)) deallocate(indxproc_obs)
+if (allocated(numptsperproc)) deallocate(numptsperproc)
+if (allocated(numobsperproc)) deallocate(numobsperproc)
+if (allocated(indxproc)) deallocate(indxproc)
+if (associated(kdtree_obs)) call kdtree2_destroy(kdtree_obs)
+if (associated(kdtree_obs2)) call kdtree2_destroy(kdtree_obs2)
+if (associated(kdtree_grid)) call kdtree2_destroy(kdtree_grid)
+end subroutine loadbal_cleanup_efsoi
+
+end module loadbal_efsoi
diff --git a/util/EFSOI_Utilities/src/loc_advection.f90 b/util/EFSOI_Utilities/src/loc_advection.f90
new file mode 100644
index 0000000000..f81f4e0665
--- /dev/null
+++ b/util/EFSOI_Utilities/src/loc_advection.f90
@@ -0,0 +1,100 @@
+module loc_advection
+
+use mpisetup
+use gridinfo, only: latsgrd,lonsgrd,nlevs_pres,npts
+use statevec_efsoi, only: id_u,id_v
+use loadbal_efsoi, only: numptsperproc, indxproc, kdtree_grid
+use scatter_chunks_efsoi
+use kinds
+use params
+use constants
+use kdtree2_module
+implicit none
+
+private
+public loc_advection_efsoi, adloc_chunk
+real(r_single),allocatable,dimension(:,:) :: adloc_chunk
+
+contains
+
+subroutine loc_advection_efsoi
+!$$$  subprogram documentation block
+!                .      .    .                                       .
+! subprogram:    loc_advection
+!   prgmmr: ota
+!
+! abstract: computes analysis grid point location with simple horizontal
+!   advection.
+!
+! program history log:
+!   2011-09-16  ota    - created
+!
+!   input argument list:
+!
+!   output argument list:
+!
+! attributes:
+!   language: f90
+!   machine:
+!
+!$$$ end documentation block
+  implicit none
+  real(r_kind),allocatable,dimension(:) :: coslat
+  real(r_kind) :: adtime, halfpi, pi2, adlon, adlat
+  integer(i_kind) :: npt, nn, nnu, nnv, nnpt
+  allocate(adloc_chunk(3,numptsperproc(nproc+1)*nlevs_pres))
+  allocate(coslat(numptsperproc(nproc+1)))
+  adtime = adrate*real(eft,r_kind)*3600._r_kind/rearth
+  halfpi = half * pi
+  pi2 = 2._r_kind * pi
+!$omp parallel private(npt)
+!$omp do
+  do npt=1,numptsperproc(nproc+1)
+     coslat(npt) = 1._r_kind/cos(latsgrd(indxproc(nproc+1,npt)))
+  end do
+!$omp end do
+!$omp end parallel
+!$omp parallel private(nn,nnu,nnv,npt,nnpt,adlon,adlat)
+!$omp do
+  do nn=1,nlevs_pres
+     if(nn == nlevs_pres) then
+        nnu = id_u(1)
+        nnv = id_v(1)
+     else
+        nnu = id_u(nn)
+        nnv = id_v(nn)
+     end if
+     do npt=1,numptsperproc(nproc+1)
+        nnpt = nlevs_pres * (npt-1) + nn
+        adlon = lonsgrd(indxproc(nproc+1,npt)) &
+             & - half * (ensmean_chunk(npt,nnu) + analmean_chunk(npt,nnu)) &
+             & * coslat(npt) * adtime
+        adlat = latsgrd(indxproc(nproc+1,npt)) &
+             & - half * (ensmean_chunk(npt,nnv) + analmean_chunk(npt,nnv)) &
+             & * adtime
+        if(adlat > halfpi) then
+           adlat = pi - adlat
+           adlon = adlon + pi
+        else if(adlat < -halfpi) then
+           adlat = -pi - adlat
+           adlon = adlon + pi
+        end if
+        if(adlon > pi2) then
+           adlon = mod(adlon,pi2)
+        else if(adlon < zero) then
+           adlon = mod(adlon,pi2) + pi2
+        end if
+        adloc_chunk(1,nnpt) = cos(adlat)*cos(adlon)
+        adloc_chunk(2,nnpt) = cos(adlat)*sin(adlon)
+        adloc_chunk(3,nnpt) = sin(adlat)
+     end do
+  end do
+!$omp end do
+!$omp end parallel
+  deallocate(coslat)
+  ! kd-tree
+  kdtree_grid => kdtree2_create(adloc_chunk,sort=.false.,rearrange=.true.)
+  return
+end subroutine loc_advection_efsoi
+
+end module loc_advection