From e4f180ce5873db809575672fe785ed740517ae14 Mon Sep 17 00:00:00 2001 From: Stuart Slattery Date: Fri, 21 Dec 2018 12:46:40 -0500 Subject: [PATCH 1/3] Adding base communication plan Co-authored-by: dalg24 --- core/src/CMakeLists.txt | 50 +- core/src/Cabana_CommunicationPlan.hpp | 610 ++++++++++++++++++ core/unit_test/CMakeLists.txt | 20 +- .../Cuda/tstCommunicationPlan_Cuda.cpp | 13 + .../OpenMP/tstCommunicationPlan_OpenMP.cpp | 13 + .../Pthread/tstCommunicationPlan_Pthread.cpp | 13 + .../Serial/tstCommunicationPlan_Serial.cpp | 13 + core/unit_test/tstCommunicationPlan.hpp | 535 +++++++++++++++ 8 files changed, 1259 insertions(+), 8 deletions(-) create mode 100644 core/src/Cabana_CommunicationPlan.hpp create mode 100644 core/unit_test/Cuda/tstCommunicationPlan_Cuda.cpp create mode 100644 core/unit_test/OpenMP/tstCommunicationPlan_OpenMP.cpp create mode 100644 core/unit_test/Pthread/tstCommunicationPlan_Pthread.cpp create mode 100644 core/unit_test/Serial/tstCommunicationPlan_Serial.cpp create mode 100644 core/unit_test/tstCommunicationPlan.hpp diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 79127b084..dd5678dee 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -2,21 +2,57 @@ configure_file(CabanaCore_config.hpp.cmakein CabanaCore_config.hpp) #----------------------------------------------------------------------------- -file(GLOB HEADERS_PUBLIC *.hpp) -file(GLOB SOURCES *.cpp) +set(HEADERS_PUBLIC + Cabana_AoSoA.hpp + Cabana_Core.hpp + Cabana_DeepCopy.hpp + Cabana_ExecutionPolicy.hpp + Cabana_LinkedCellList.hpp + Cabana_Macros.hpp + Cabana_MemberTypes.hpp + Cabana_NeighborList.hpp + Cabana_Parallel.hpp + Cabana_Slice.hpp + Cabana_SoA.hpp + Cabana_Sort.hpp + Cabana_Tuple.hpp + Cabana_Types.hpp + Cabana_VerletList.hpp + Cabana_Version.hpp + ) #----------------------------------------------------------------------------- +# MPI-dependent code +if(Cabana_ENABLE_MPI) + list(APPEND HEADERS_PUBLIC + Cabana_CommunicationPlan.hpp + Cabana_Distributor.hpp + Cabana_Halo.hpp + ) +endif() -file(GLOB HEADERS_IMPL impl/*.hpp) -file(GLOB SOURCES_IMPL impl/*.cpp) +#----------------------------------------------------------------------------- +# implementation details +set(HEADERS_IMPL + impl/Cabana_CartesianGrid.hpp + impl/Cabana_Index.hpp + impl/Cabana_IndexSequence.hpp + impl/Cabana_PerformanceTraits.hpp + impl/Cabana_TypeTraits.hpp + ) +set(SOURCES_IMPL + impl/Cabana_Version.cpp + ) + +#----------------------------------------------------------------------------- -install(FILES ${HEADERS_PUBLIC} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(FILES ${HEADERS_PUBLIC} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install(FILES ${HEADERS_IMPL} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/impl/) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/CabanaCore_config.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/CabanaCore_config.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) #----------------------------------------------------------------------------- -add_library(cabanacore ${SOURCES} ${SOURCES} ${SOURCES_IMPL}) +add_library(cabanacore ${SOURCES_IMPL}) target_include_directories(cabanacore PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(cabanacore Kokkos::kokkos) if(Cabana_ENABLE_MPI) diff --git a/core/src/Cabana_CommunicationPlan.hpp b/core/src/Cabana_CommunicationPlan.hpp new file mode 100644 index 000000000..07c1c0abc --- /dev/null +++ b/core/src/Cabana_CommunicationPlan.hpp @@ -0,0 +1,610 @@ 
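As a quick orientation before the new header's contents, here is a minimal sketch of the calling sequence the class is meant to support, pieced together from the API documented below; the Kokkos::HostSpace template argument, the view types, and the example() wrapper are illustrative assumptions rather than part of the patch.

    #include <Cabana_CommunicationPlan.hpp>
    #include <Kokkos_Core.hpp>
    #include <mpi.h>

    // Sketch: every local element is exported back to the calling rank.
    void example( MPI_Comm comm )
    {
        int my_rank = -1;
        MPI_Comm_rank( comm, &my_rank );

        // One destination rank per local element (-1 would mean "do not export").
        Kokkos::View<int*,Kokkos::HostSpace> export_ranks( "export_ranks", 10 );
        Kokkos::deep_copy( export_ranks, my_rank );

        // Build the plan without prior topology knowledge, then the steering
        // vector that groups elements into per-neighbor send blocks.
        Cabana::CommunicationPlan<Kokkos::HostSpace> plan( comm );
        plan.createFromExportsOnly( export_ranks );
        plan.createExportSteering( export_ranks );

        // The import count sizes the receive side of any later data movement.
        std::size_t num_recv = plan.totalNumImport();
        (void) num_recv;
    }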
+/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#ifndef CABANA_COMMUNICATIONPLAN_HPP +#define CABANA_COMMUNICATIONPLAN_HPP + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace Cabana +{ +//---------------------------------------------------------------------------// +/*! + \class CommunicationPlan + + \brief Communication plan base class. + + \tparam MemorySpace Memory space in which the data for this class will be + allocated. + + The communication plan computes how to redistribute elements in a parallel + data structure using MPI. Given a list of data elements on the local MPI + rank and their destination ranks, the communication plan computes which rank + each process is sending and receiving from and how many elements we will + send and receive. In addition, it provides an export steering vector which + describes how to pack the local data to be exported into contiguous send + buffers for each destination rank (in the forward communication plan). + + Some nomenclature: + + Export - elements we are sending in the forward communication plan. + + Import - elements we are receiving in the forward communication plan. + + \note If a communication plan does self-sends (i.e. exports and imports data + from its own ranks) then this data is first in the data structure. What this + means is that neighbor 0 is the local rank and the data for that rank that + is being exported will appear first in the steering vector. +*/ +template +class CommunicationPlan +{ + public: + + // Cabana memory space. + using memory_space = MemorySpace; + + // Kokkos execution space. + using execution_space = typename memory_space::execution_space; + + /*! + \brief Constructor. + + \param comm The MPI communicator over which the distributor is defined. + + \return The MPI communicator for this plan. + */ + CommunicationPlan( MPI_Comm comm ) + : _comm( comm ) + {} + + /*! + \brief Get the MPI communicator. + */ + MPI_Comm comm() const + { return _comm; } + + /*! + \brief Get the number of neighbor ranks that this rank will communicate + with. + + \return The number of MPI ranks that will exchange data with this rank. + */ + int numNeighbor() const + { return _neighbors.size(); } + + /*! + \brief Given a local neighbor id get its rank in the MPI communicator. + + \param neighbor The local id of the neighbor to get the rank for. + + \return The MPI rank of the neighbor with the given local id. + */ + int neighborRank( const int neighbor ) const + { return _neighbors[neighbor]; } + + /*! + \brief Get the number of elements this rank will export to a given neighbor. + + \param neighbor The local id of the neighbor to get the number of + exports for. + + \return The number of elements this rank will export to the neighbor with the + given local id. + */ + std::size_t numExport( const int neighbor ) const + { return _num_export[neighbor]; } + + /*! + \brief Get the total number of exports this rank will do. + + \return The total number of elements this rank will export to its + neighbors. 
+ */ + std::size_t totalNumExport() const + { return _total_num_export; } + + /*! + \brief Get the number of elements this rank will import from a given neighbor. + + \param neighbor The local id of the neighbor to get the number of + imports for. + + \return The number of elements this rank will import from the neighbor + with the given local id. + */ + std::size_t numImport( const int neighbor ) const + { return _num_import[neighbor]; } + + /*! + \brief Get the total number of imports this rank will do. + + \return The total number of elements this rank will import from its + neighhbors. + */ + std::size_t totalNumImport() const + { return _total_num_import; } + + /*! + \brief Get the number of export elements. + + Whenever the communciation plan is applied, this is the total number of + elements expected to be input on the sending ranks (in the forward + communication plan). This will be different than the number returned by + totalNumExport() if some of the export ranks used in the construction + are -1 and therefore will not particpate in an export operation. + + \return The number of export elements. + */ + std::size_t exportSize() const + { return _num_export_element; } + + /*! + \brief Get the steering vector for the exports. + + \return The steering vector for the exports. + + The steering vector places exports in contiguous chunks by destination + rank. The chunks are in consecutive order based on the local neighbor id + (i.e. all elements going to neighbor with local id 0 first, then all + elements going to neighbor with local id 1, etc.). + */ + Kokkos::View getExportSteering() const + { return _export_steering; } + + // The functions in the public block below would normally be protected but + // we make them public to allow using private class data in CUDA kernels + // with lambda functions. + public: + + /*! + \brief Neighbor and export rank creator. Use this when you already know + which ranks neighbor each other (i.e. every rank already knows who they + will be sending and receiving from) as it will be more efficient. In + this case you already know the topology of the point-to-point + communication but not how much data to send to and receive from the + neighbors. + + \param element_export_ranks The destination rank in the target + decomposition of each locally owned element in the source + decomposition. Each element will have one unique destination to which it + will be exported. This export rank may be any one of the listed neighbor + ranks which can include the calling rank. An export rank of -1 will + signal that this element is *not* to be exported and will be ignored in + the data migration. The input is expected to be a Kokkos view or Cabana + slice in the same memory space as the communication plan. + + \param mpi_tag The MPI tag to use for non-blocking communication in the + communication plan generation. + + \param neighbor_ranks List of ranks this rank will send to and receive + from. This list can include the calling rank. This is effectively a + description of the topology of the point-to-point communication plan. + + \note Calling this function completely updates the state of this object + and invalidates the previous state. + + \note For elements that you do not wish to export, use an export rank of + -1 to signal that this element is *not* to be exported and will be + ignored in the data migration. In other words, this element will be + *completely* removed in the new decomposition. 
If the data is staying on + this rank, just use this rank as the export destination and the data + will be efficiently migrated. + */ + template + void createFromExportsAndTopology( + const ViewType& element_export_ranks, + const std::vector& neighbor_ranks, + const int mpi_tag = 1221 ) + { + // Store the number of export elements. + _num_export_element = element_export_ranks.size(); + + // Store the neighbors. + _neighbors = neighbor_ranks; + int num_n = _neighbors.size(); + + // Get the MPI rank we are currently on. + int my_rank = -1; + MPI_Comm_rank( _comm, &my_rank ); + + // If we are sending to ourself put that one first in the neighbor + // list. + for ( auto& n : _neighbors ) + if ( n == my_rank ) + { + std::swap( n, _neighbors[0] ); + break; + } + + // Initialize import/export sizes. + _num_export.assign( num_n, 0 ); + _num_import.assign( num_n, 0 ); + + // Copy the topology to the device. + Kokkos::View + topology_host( _neighbors.data(), num_n ); + auto topology = Kokkos::create_mirror_view_and_copy( + memory_space(), topology_host ); + + // Count the number of sends to each neighbor. + Kokkos::View + num_export_host( _num_export.data(), num_n ); + auto export_counts = Kokkos::create_mirror_view_and_copy( + memory_space(), num_export_host ); + auto count_neighbor_func = + KOKKOS_LAMBDA( const int i ) + { + for ( int n = 0; n < num_n; ++n ) + if ( topology(n) == element_export_ranks(i) ) + Kokkos::atomic_increment( &export_counts(n) ); + }; + Kokkos::RangePolicy count_neighbor_policy( + 0, _num_export_element ); + Kokkos::parallel_for( "Cabana::CommunicationPlan::count_neighbors", + count_neighbor_policy, + count_neighbor_func ); + Kokkos::fence(); + + // Copy counts back to the host. + Kokkos::deep_copy( num_export_host, export_counts ); + + // Post receives for the number of imports we will get. + std::vector requests; + requests.reserve( num_n ); + for ( int n = 0; n < num_n; ++n ) + if ( my_rank != _neighbors[n] ) + { + requests.push_back( MPI_Request() ); + MPI_Irecv( &_num_import[n], + 1, + MPI_UNSIGNED_LONG, + _neighbors[n], + mpi_tag, + _comm, + &(requests.back()) ); + } + else + _num_import[n] = _num_export[n]; + + // Send the number of exports to each of our neighbors. + for ( int n = 0; n < num_n; ++n ) + if ( my_rank != _neighbors[n] ) + MPI_Send( &_num_export[n], + 1, + MPI_UNSIGNED_LONG, + _neighbors[n], + mpi_tag, + _comm ); + + // Wait on receives. + std::vector status( requests.size() ); + const int ec = + MPI_Waitall( requests.size(), requests.data(), status.data() ); + assert( MPI_SUCCESS == ec ); + + // Get the total number of imports/exports. + _total_num_export = + std::accumulate( _num_export.begin(), _num_export.end(), 0 ); + _total_num_import = + std::accumulate( _num_import.begin(), _num_import.end(), 0 ); + + // Barrier before continuing to ensure synchronization. + MPI_Barrier( _comm ); + } + + /*! + \brief Export rank creator. Use this when you don't know who you will + receiving from - only who you are sending to. This is less efficient + than if we already knew who our neighbors were because we have to + determine the topology of the point-to-point communication first. + + \param element_export_ranks The destination rank in the target + decomposition of each locally owned element in the source + decomposition. Each element will have one unique destination to which it + will be exported. This export rank may any one of the listed neighbor + ranks which can include the calling rank. 
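A short sketch of the topology-aware creator implemented just above, reusing the hypothetical plan, export_ranks, and my_rank names from the sketch near the top of this file; here the only neighbor is the calling rank itself.

    // The communication partners are already known, so pass the neighbor
    // list and skip the topology discovery done by the export-only creator.
    std::vector<int> neighbor_ranks( 1, my_rank );
    plan.createFromExportsAndTopology( export_ranks, neighbor_ranks );
    plan.createExportSteering( export_ranks );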
An export rank of -1 will + signal that this element is *not* to be exported and will be ignored in + the data migration. The input is expected to be a Kokkos view or Cabana + slice in the same memory space as the communication plan. + + \param mpi_tag The MPI tag to use for non-blocking communication in the + communication plan generation. + + \note Calling this function completely updates the state of this object + and invalidates the previous state. + + \note For elements that you do not wish to export, use an export rank of + -1 to signal that this element is *not* to be exported and will be + ignored in the data migration. In other words, this element will be + *completely* removed in the new decomposition. If the data is staying on + this rank, just use this rank as the export destination and the data + will be efficiently migrated. + */ + template + void createFromExportsOnly( const ViewType& element_export_ranks, + const int mpi_tag = 1221 ) + { + // Store the number of export elements. + _num_export_element = element_export_ranks.size(); + + // Get the size of this communicator. + int comm_size = -1; + MPI_Comm_size( _comm, &comm_size ); + + // Get the MPI rank we are currently on. + int my_rank = -1; + MPI_Comm_rank( _comm, &my_rank ); + + // Count the number of sends this rank will do to other ranks. + Kokkos::View neighbor_counts( + "neighbor_counts", comm_size ); + auto count_sends_func = + KOKKOS_LAMBDA( const int i ) + { + if ( element_export_ranks(i) >= 0 ) + Kokkos::atomic_increment( + &neighbor_counts(element_export_ranks(i)) ); + }; + Kokkos::RangePolicy count_sends_policy( + 0, _num_export_element ); + Kokkos::parallel_for( "Cabana::CommunicationPlan::count_sends", + count_sends_policy, + count_sends_func ); + Kokkos::fence(); + + // Copy the counts to the host. + auto neighbor_counts_host = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), neighbor_counts ); + + // Extract the export ranks and number of exports and then flag the + // send ranks. + _neighbors.clear(); + _num_export.clear(); + _total_num_export = 0; + for ( int r = 0; r < comm_size; ++r ) + if ( neighbor_counts_host(r) > 0 ) + { + _neighbors.push_back( r ); + _num_export.push_back( neighbor_counts_host(r) ); + _total_num_export += neighbor_counts_host(r); + neighbor_counts_host(r) = 1; + } + + // Get the number of export ranks and initially allocate the import sizes. + int num_export_rank = _neighbors.size(); + _num_import.assign( num_export_rank, 0 ); + + // If we are sending to ourself put that one first in the neighbor + // list and assign the number of imports to be the number of exports. + bool self_send = false; + for ( int n = 0; n < num_export_rank; ++n ) + if ( _neighbors[n] == my_rank ) + { + std::swap( _neighbors[n], _neighbors[0] ); + std::swap( _num_export[n], _num_export[0] ); + _num_import[0] = _num_export[0]; + self_send = true; + break; + } + + // Determine how many total import ranks each neighbor has. + std::vector total_receives( comm_size ); + int root_rank = 0; + MPI_Reduce( neighbor_counts_host.data(), + total_receives.data(), + comm_size, + MPI_INT, + MPI_SUM, + root_rank, + _comm ); + int num_import_rank = -1; + MPI_Scatter( total_receives.data(), + 1, + MPI_INT, + &num_import_rank, + 1, + MPI_INT, + root_rank, + _comm ); + if ( self_send ) --num_import_rank; + + // Post the expected number of receives and indicate we might get them + // from any rank. 
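The MPI_Reduce/MPI_Scatter pair a few lines above is what determines, with no prior topology information, how many messages this rank should expect; stripped down to just that pattern (comm, comm_size, and the flag vector are stand-ins for the member data used above) it looks like the following.

    // Flag every rank this rank will send to with a 1.
    std::vector<int> send_flags( comm_size, 0 );
    // ... send_flags[r] = 1 for each destination rank r ...

    // Summing the flags element-wise on a root rank counts, for each rank r,
    // how many ranks will send to r; scattering then hands every rank its
    // own entry of that result.
    std::vector<int> num_senders( comm_size, 0 );
    MPI_Reduce( send_flags.data(), num_senders.data(), comm_size,
                MPI_INT, MPI_SUM, 0, comm );
    int my_num_senders = -1;
    MPI_Scatter( num_senders.data(), 1, MPI_INT,
                 &my_num_senders, 1, MPI_INT, 0, comm );
    // my_num_senders is the number of size messages to post receives for
    // (less one when this rank also flagged itself, as handled above).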
+ std::vector import_sizes( num_import_rank ); + std::vector requests( num_import_rank ); + for ( int n = 0; n < num_import_rank; ++n ) + MPI_Irecv( &import_sizes[n], + 1, + MPI_UNSIGNED_LONG, + MPI_ANY_SOURCE, + mpi_tag, + _comm, + &requests[n] ); + + // Do blocking sends. Dont do any self sends. + int self_offset = (self_send) ? 1 : 0; + for ( int n = self_offset; n < num_export_rank; ++n ) + MPI_Send( &_num_export[n], + 1, + MPI_UNSIGNED_LONG, + _neighbors[n], + mpi_tag, + _comm ); + + // Wait on non-blocking receives. + std::vector status( requests.size() ); + const int ec = + MPI_Waitall( requests.size(), requests.data(), status.data() ); + assert( MPI_SUCCESS == ec ); + + // Compute the total number of imports. + _total_num_import = std::accumulate( + import_sizes.begin(), import_sizes.end(), + (self_send) ? _num_import[0] : 0 ); + + // Extract the imports. If we did self sends we already know what + // imports we got from that. + for ( int i = 0; i < num_import_rank; ++i ) + { + // Get the message source. + const auto source = status[i].MPI_SOURCE; + + // See if the neighbor we received stuff from was someone we also + // sent stuff to. If it was, just record what they sent us. + auto found_neighbor = + std::find( _neighbors.begin(), _neighbors.end(), source ); + + // If this is a new neighbor (i.e. someone we didn't send anything + // to) record this. Otherwise add it to the one we found. + if ( found_neighbor == std::end(_neighbors) ) + { + _neighbors.push_back( source ); + _num_import.push_back( import_sizes[i] ); + _num_export.push_back( 0 ); + } + else + { + _num_import[i+self_offset] = import_sizes[i]; + } + } + + // Barrier before continuing to ensure synchronization. + MPI_Barrier( _comm ); + } + + /*! + \brief Create the export steering vector. + + Creates an array describing which export element ids are moved to which + location in the send buffer of the communcation plan. Ordered such that + if a rank sends to itself then those values come first. + + \param element_export_ranks The ranks to which we are exporting each + element. We use this to build the steering vector. The input is expected + to be a Kokkos view or Cabana slice in the same memory space as the + communication plan. + */ + template + void createExportSteering( const ViewType& element_export_ranks ) + { + // passing in element_export_ranks here as a dummy argument. + createSteering( true, element_export_ranks, element_export_ranks ); + } + + /*! + \brief Create the export steering vector. + + Creates an array describing which export element ids are moved to which + location in the contiguous send buffer of the communcation plan. Ordered + such that if a rank sends to itself then those values come first. + + \param element_export_ranks The ranks to which we are exporting each + element. We use this to build the steering vector. The input is expected + to be a Kokkos view or Cabana slice in the same memory space as the + communication plan. + + \param element_export_ids The local ids of the elements to be + exported. This corresponds with the export ranks vector and must be the + same length if defined. The input is expected to be a Kokkos view or + Cabana slice in the same memory space as the communication plan. + */ + template + void createExportSteering( + const RankViewType& element_export_ranks, + const IdViewType& element_export_ids ) + { + createSteering( false, element_export_ranks, element_export_ids ); + } + + // Create the export steering vector. 
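Before the shared helper below, a brief sketch of the two public overloads documented above, again reusing the hypothetical plan and export_ranks names from the sketch at the top of this file; the id view is likewise invented for illustration.

    // One-argument form: export element i is local element id i.
    plan.createExportSteering( export_ranks );

    // Two-argument form: export element i is local element export_ids(i).
    // Both views must have the same length.
    Kokkos::View<std::size_t*,Kokkos::HostSpace> export_ids(
        "export_ids", export_ranks.size() );
    // ... fill export_ids with the local ids being exported ...
    plan.createExportSteering( export_ranks, export_ids );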
+ template + void createSteering( + const bool use_iota, + const RankViewType& element_export_ranks, + const IdViewType& element_export_ids ) + { + if ( !use_iota && + (element_export_ids.size() != element_export_ranks.size()) ) + throw std::runtime_error("Export ids and ranks different sizes!"); + + // Calculate the steering offsets via exclusive prefix sum for the + // exports. + int num_n = _neighbors.size(); + Kokkos::View + offsets_host( "offsets", num_n ); + for ( int n = 1; n < num_n; ++n ) + offsets_host(n) = offsets_host(n-1) + _num_export[n-1]; + + // Copy the offsets to the device. + auto offsets = Kokkos::create_mirror_view_and_copy( + memory_space(), offsets_host ); + + // Copy the neighbors to the device. + Kokkos::View + neighbor_ranks_host( _neighbors.data(), num_n ); + auto neighbor_ranks = Kokkos::create_mirror_view_and_copy( + memory_space(), neighbor_ranks_host ); + + // Create the export steering vector for writing local elements into + // the send buffer. Note we create a local, shallow copy - this is a + // CUDA workaround. + _export_steering = Kokkos::View( + Kokkos::ViewAllocateWithoutInitializing("export_steering"), + _total_num_export ); + auto steer_vec = _export_steering; + Kokkos::View counts( "counts", num_n ); + auto steer_func = + KOKKOS_LAMBDA( const int i ) + { + for ( int n = 0; n < num_n; ++n ) + if ( element_export_ranks(i) == neighbor_ranks(n) ) + { + auto c = Kokkos::atomic_fetch_add( &counts(n), 1 ); + steer_vec( offsets(n) + c ) = + (use_iota) ? i : element_export_ids(i); + break; + } + }; + Kokkos::RangePolicy steer_policy( + 0, element_export_ranks.size() ); + Kokkos::parallel_for( "Cabana::createSteering", + steer_policy, + steer_func ); + Kokkos::fence(); + } + + private: + + MPI_Comm _comm; + std::vector _neighbors; + std::size_t _total_num_export; + std::size_t _total_num_import; + std::vector _num_export; + std::vector _num_import; + std::size_t _num_export_element; + Kokkos::View _export_steering; +}; + +//---------------------------------------------------------------------------// + +} // end namespace Cabana + +#endif // end CABANA_COMMUNICATIONPLAN_HPP diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 7237a8193..9ff442914 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -22,7 +22,7 @@ foreach(_test Version Index CartesianGrid SoA) endforeach() ##--------------------------------------------------------------------------## -## Tests. +## On-node tests. ##--------------------------------------------------------------------------## foreach(_device ${CABANA_SUPPORTED_DEVICES}) if(Cabana_ENABLE_${_device}) @@ -40,3 +40,21 @@ foreach(_device ${CABANA_SUPPORTED_DEVICES}) endforeach() endif() endforeach() + +##--------------------------------------------------------------------------## +## MPI tests. 
+##--------------------------------------------------------------------------## +if(${Cabana_ENABLE_MPI}) + foreach(_device ${CABANA_SUPPORTED_DEVICES}) + if(Cabana_ENABLE_${_device}) + foreach(_test CommunicationPlan Distributor Halo) + add_executable(${_test}_test_${_device} ${_device}/tst${_test}_${_device}.cpp mpi_unit_test_main.cpp) + target_link_libraries(${_test}_test_${_device} cabanacore cabana_core_gtest) + foreach(_np 1 ${MPIEXEC_MAX_NUMPROCS}) + add_test(NAME ${_test}_test_${_device}_${_np} COMMAND + ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${_np} ${_test}_test_${_device} --gtest_color=yes) + endforeach() + endforeach() + endif() + endforeach() +endif() diff --git a/core/unit_test/Cuda/tstCommunicationPlan_Cuda.cpp b/core/unit_test/Cuda/tstCommunicationPlan_Cuda.cpp new file mode 100644 index 000000000..5961ea2a6 --- /dev/null +++ b/core/unit_test/Cuda/tstCommunicationPlan_Cuda.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/OpenMP/tstCommunicationPlan_OpenMP.cpp b/core/unit_test/OpenMP/tstCommunicationPlan_OpenMP.cpp new file mode 100644 index 000000000..dc5e28f99 --- /dev/null +++ b/core/unit_test/OpenMP/tstCommunicationPlan_OpenMP.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/Pthread/tstCommunicationPlan_Pthread.cpp b/core/unit_test/Pthread/tstCommunicationPlan_Pthread.cpp new file mode 100644 index 000000000..178f0c9de --- /dev/null +++ b/core/unit_test/Pthread/tstCommunicationPlan_Pthread.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/Serial/tstCommunicationPlan_Serial.cpp b/core/unit_test/Serial/tstCommunicationPlan_Serial.cpp new file mode 100644 index 000000000..47728992d --- /dev/null +++ b/core/unit_test/Serial/tstCommunicationPlan_Serial.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. 
* + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/tstCommunicationPlan.hpp b/core/unit_test/tstCommunicationPlan.hpp new file mode 100644 index 000000000..ec284aade --- /dev/null +++ b/core/unit_test/tstCommunicationPlan.hpp @@ -0,0 +1,535 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include + +#include + +#include + +#include + +#include +#include + +namespace Test +{ +//---------------------------------------------------------------------------// +class CommPlanTester : public Cabana::CommunicationPlan +{ + public: + + CommPlanTester( MPI_Comm comm ) + : Cabana::CommunicationPlan(comm) + {} + + template + void createFromExportsAndNeighbors( + const ViewType& element_export_ranks, + const std::vector& neighbor_ranks ) + { + this->createFromExportsAndTopology( + element_export_ranks, neighbor_ranks ); + } + + template + void createFromExports( const ViewType& element_export_ranks ) + { + this->createFromExportsOnly( element_export_ranks ); + } + + template + void createSteering( const ViewType& element_export_ranks ) + { + this->createExportSteering( element_export_ranks ); + } + + template + void createSteering( + const RankViewType& element_export_ranks, + const IdViewType& element_export_ids ) + { + this->createExportSteering( + element_export_ranks, element_export_ids ); + } +}; + +//---------------------------------------------------------------------------// +void test1( const bool use_topology ) +{ + // Make a communication plan. + CommPlanTester comm_plan( MPI_COMM_WORLD ); + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Every rank will communicate with itself and send all of its data. + int num_data = 10; + Kokkos::View + export_ranks( "export_ranks", num_data ); + Kokkos::deep_copy( export_ranks, my_rank ); + std::vector neighbor_ranks( 1, my_rank ); + + // Create the plan. + if ( use_topology ) + comm_plan.createFromExportsAndNeighbors( export_ranks, neighbor_ranks ); + else + comm_plan.createFromExports( export_ranks ); + + // Check the plan. + EXPECT_EQ( comm_plan.numNeighbor(), 1 ); + EXPECT_EQ( comm_plan.neighborRank(0), my_rank ); + EXPECT_EQ( comm_plan.numExport(0), num_data ); + EXPECT_EQ( comm_plan.totalNumExport(), num_data ); + EXPECT_EQ( comm_plan.numImport(0), num_data ); + EXPECT_EQ( comm_plan.totalNumImport(), num_data ); + + // Create the export steering vector. + comm_plan.createSteering( export_ranks ); + + // Check the steering vector. We thread the creation of the steering + // vector so we don't really know what order it is in - only that it is + // grouped by the ranks to which we are exporting. In this case just sort + // the steering vector and make sure all of the ids are there. We can do + // this because we are only sending to one rank. 
+ auto steering = comm_plan.getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + std::sort( host_steering.data(), + host_steering.data() + host_steering.size() ); + EXPECT_EQ( host_steering.size(), num_data ); + for ( int n = 0; n < num_data; ++n ) + EXPECT_EQ( n, host_steering(n) ); +} + +//---------------------------------------------------------------------------// +void test2( const bool use_topology ) +{ + // Make a communication plan. + CommPlanTester comm_plan( MPI_COMM_WORLD ); + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Every rank will communicate with itself and send every other piece of + // data. + int num_data = 10; + Kokkos::View export_ranks_host( + "export_ranks", num_data ); + for ( int n = 0; n < num_data; ++n ) + export_ranks_host(n) = ( 0 == n%2 ) ? my_rank : -1; + auto export_ranks = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ranks_host ); + std::vector neighbor_ranks( 1, my_rank ); + + // Create the plan + if ( use_topology ) + comm_plan.createFromExportsAndNeighbors( export_ranks, neighbor_ranks ); + else + comm_plan.createFromExports( export_ranks ); + + // Check the plan. + EXPECT_EQ( comm_plan.numNeighbor(), 1 ); + EXPECT_EQ( comm_plan.neighborRank(0), my_rank ); + EXPECT_EQ( comm_plan.numExport(0), num_data / 2 ); + EXPECT_EQ( comm_plan.totalNumExport(), num_data / 2 ); + EXPECT_EQ( comm_plan.numImport(0), num_data / 2); + EXPECT_EQ( comm_plan.totalNumImport(), num_data / 2 ); + + // Create the export steering vector. + Kokkos::View + export_ids_host( "export_ids", export_ranks.size() ); + std::iota( export_ids_host.data(), + export_ids_host.data() + export_ranks.size(), + 0 ); + auto element_export_ids = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ids_host ); + comm_plan.createSteering( export_ranks, element_export_ids ); + + // Check the steering vector. We thread the creation of the steering + // vector so we don't really know what order it is in - only that it is + // grouped by the ranks to which we are exporting. In this case just sort + // the steering vector and make sure all of the ids are there. We can do + // this because we are only sending to one rank. + auto steering = comm_plan.getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + std::sort( host_steering.data(), + host_steering.data() + host_steering.size() ); + EXPECT_EQ( host_steering.size(), num_data / 2 ); + for ( int n = 0; n < num_data / 2; ++n ) + EXPECT_EQ( n * 2, host_steering(n) ); +} + +//---------------------------------------------------------------------------// +void test3( const bool use_topology ) +{ + // Make a communication plan. + CommPlanTester comm_plan( MPI_COMM_WORLD ); + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get my size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Compute the inverse rank. + int inverse_rank = my_size - my_rank - 1; + + // Every rank will communicate with the rank that is its inverse. + int num_data = 10; + Kokkos::View + export_ranks( "export_ranks", num_data ); + Kokkos::deep_copy( export_ranks, inverse_rank ); + std::vector neighbor_ranks( 1, inverse_rank ); + + // Create the plan with both export ranks and the topology. 
+ if ( use_topology ) + comm_plan.createFromExportsAndNeighbors( export_ranks, neighbor_ranks ); + else + comm_plan.createFromExports( export_ranks ); + + // Check the plan. + EXPECT_EQ( comm_plan.numNeighbor(), 1 ); + EXPECT_EQ( comm_plan.neighborRank(0), inverse_rank ); + EXPECT_EQ( comm_plan.numExport(0), num_data ); + EXPECT_EQ( comm_plan.totalNumExport(), num_data ); + EXPECT_EQ( comm_plan.numImport(0), num_data ); + EXPECT_EQ( comm_plan.totalNumImport(), num_data ); + + // Create the export steering vector. + comm_plan.createSteering( export_ranks ); + + // Check the steering vector. We thread the creation of the steering + // vector so we don't really know what order it is in - only that it is + // grouped by the ranks to which we are exporting. In this case just sort + // the steering vector and make sure all of the ids are there. We can do + // this because we are only sending to one rank. + auto steering = comm_plan.getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + std::sort( host_steering.data(), + host_steering.data() + host_steering.size() ); + for ( int n = 0; n < num_data; ++n ) + EXPECT_EQ( n, host_steering(n) ); +} + +//---------------------------------------------------------------------------// +void test4( const bool use_topology ) +{ + // Make a communication plan. + CommPlanTester comm_plan( MPI_COMM_WORLD ); + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get my size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Every rank will communicate with all other ranks. Interleave the sends. + int num_data = 2 * my_size; + Kokkos::View export_ranks_host( + "export_ranks", num_data ); + std::vector neighbor_ranks( my_size ); + for ( int n = 0; n < my_size; ++n ) + { + export_ranks_host[n] = n; + export_ranks_host[n + my_size] = n; + neighbor_ranks[n] = n; + } + auto export_ranks = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ranks_host ); + + // Create the plan + if ( use_topology ) + comm_plan.createFromExportsAndNeighbors( export_ranks, neighbor_ranks ); + else + comm_plan.createFromExports( export_ranks ); + + // Check the plan. Note that if we are sending to ourselves (which we are) + // that then that data is listed as the first neighbor. + EXPECT_EQ( comm_plan.numNeighbor(), my_size ); + EXPECT_EQ( comm_plan.totalNumExport(), num_data ); + EXPECT_EQ( comm_plan.totalNumImport(), num_data ); + + // self send + EXPECT_EQ( comm_plan.neighborRank(0), my_rank ); + EXPECT_EQ( comm_plan.numExport(0), 2 ); + EXPECT_EQ( comm_plan.numImport(0), 2 ); + + // others + for ( int n = 1; n < my_size; ++n ) + { + // the algorithm will swap this rank and the first one. + if ( n == my_rank ) + EXPECT_EQ( comm_plan.neighborRank(n), 0 ); + else + EXPECT_EQ( comm_plan.neighborRank(n), n ); + + EXPECT_EQ( comm_plan.numExport(n), 2 ); + EXPECT_EQ( comm_plan.numImport(n), 2 ); + } + + // Create the export steering vector. + comm_plan.createSteering( export_ranks ); + + // Check the steering vector. The algorithm will pack the ids according to + // send rank and self sends will appear first. + auto steering = comm_plan.getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + EXPECT_EQ( host_steering.size(), num_data ); + + // self sends - we don't know which order the self sends are in but we + // know they are the first 2. 
+ if ( my_rank == host_steering(0) ) + { + EXPECT_EQ( host_steering(0), my_rank ); + EXPECT_EQ( host_steering(1), my_rank + my_size ); + } + else + { + EXPECT_EQ( host_steering(1), my_rank ); + EXPECT_EQ( host_steering(0), my_rank + my_size ); + } + + // others. again, we don't know which order the vector was made in but we + // do know they are grouped by the rank to which we are sending and we + // know how those ranks are ordered. + for ( int n = 1; n < my_size; ++n ) + { + if ( n == my_rank ) + { + if ( 0 == host_steering(2*n) ) + { + EXPECT_EQ( host_steering(2*n), 0 ); + EXPECT_EQ( host_steering(2*n+1), my_size ); + } + else + { + EXPECT_EQ( host_steering(2*n+1), 0 ); + EXPECT_EQ( host_steering(2*n), my_size ); + } + } + else + { + if ( n == host_steering(2*n) ) + { + EXPECT_EQ( host_steering(2*n), n ); + EXPECT_EQ( host_steering(2*n+1), n + my_size ); + } + else + { + EXPECT_EQ( host_steering(2*n+1), n ); + EXPECT_EQ( host_steering(2*n), n + my_size ); + } + } + } +} + +//---------------------------------------------------------------------------// +void test5( const bool use_topology ) +{ + // Make a communication plan. + CommPlanTester comm_plan( MPI_COMM_WORLD ); + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get my size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Every rank will communicate with all other ranks. Interleave the sends + // and only send every other value. + int num_data = 2 * my_size; + Kokkos::View export_ranks_host( + "export_ranks", num_data ); + std::vector neighbor_ranks( my_size ); + for ( int n = 0; n < my_size; ++n ) + { + export_ranks_host[n] = -1; + export_ranks_host[n + my_size] = n; + neighbor_ranks[n] = n; + } + auto export_ranks = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ranks_host ); + + // Create the plan + if ( use_topology ) + comm_plan.createFromExportsAndNeighbors( export_ranks, neighbor_ranks ); + else + comm_plan.createFromExports( export_ranks ); + + // Check the plan. Note that if we are sending to ourselves (which we are) + // that then that data is listed as the first neighbor. + EXPECT_EQ( comm_plan.numNeighbor(), my_size ); + EXPECT_EQ( comm_plan.totalNumExport(), num_data / 2); + EXPECT_EQ( comm_plan.totalNumImport(), num_data / 2); + + // self send + EXPECT_EQ( comm_plan.neighborRank(0), my_rank ); + EXPECT_EQ( comm_plan.numExport(0), 1 ); + EXPECT_EQ( comm_plan.numImport(0), 1 ); + + // others + for ( int n = 1; n < my_size; ++n ) + { + // the algorithm will swap this rank and the first one. + if ( n == my_rank ) + EXPECT_EQ( comm_plan.neighborRank(n), 0 ); + else + EXPECT_EQ( comm_plan.neighborRank(n), n ); + + EXPECT_EQ( comm_plan.numExport(n), 1 ); + EXPECT_EQ( comm_plan.numImport(n), 1 ); + } + + // Create the export steering vector. + comm_plan.createSteering( export_ranks ); + + // Check the steering vector. The algorithm will pack the ids according to + // send rank and self sends will appear first. 
+ auto steering = comm_plan.getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + EXPECT_EQ( host_steering.size(), my_size ); + + // self sends + EXPECT_EQ( host_steering(0), my_rank + my_size ); + + // others + for ( int n = 1; n < my_size; ++n ) + { + if ( n == my_rank ) + EXPECT_EQ( host_steering(n), my_size ); + else + EXPECT_EQ( host_steering(n), n + my_size ); + } +} + +//---------------------------------------------------------------------------// +void test6( const bool use_topology ) +{ + // Make a communication plan. + CommPlanTester comm_plan( MPI_COMM_WORLD ); + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get the comm size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Every has one element and will send that element to rank 0. + int num_data = 1; + Kokkos::View + export_ranks( "export_ranks", num_data ); + Kokkos::deep_copy( export_ranks, 0 ); + std::vector neighbor_ranks; + if ( 0 == my_rank ) + { + neighbor_ranks.resize( my_size ); + std::iota( neighbor_ranks.begin(), neighbor_ranks.end(), 0 ); + } + else + { + neighbor_ranks.assign( 1, 0 ); + } + + // Create the plan. + if ( use_topology ) + comm_plan.createFromExportsAndNeighbors( export_ranks, neighbor_ranks ); + else + comm_plan.createFromExports( export_ranks ); + + // Check the plan. + if ( 0 == my_rank ) + { + EXPECT_EQ( comm_plan.numNeighbor(), my_size ); + EXPECT_EQ( comm_plan.numImport(0), 1 ); + EXPECT_EQ( comm_plan.totalNumImport(), my_size ); + } + else + { + EXPECT_EQ( comm_plan.numNeighbor(), 1 ); + EXPECT_EQ( comm_plan.neighborRank(0), 0 ); + EXPECT_EQ( comm_plan.numImport(0), 0 ); + EXPECT_EQ( comm_plan.totalNumImport(), 0 ); + } + EXPECT_EQ( comm_plan.numExport(0), num_data ); + EXPECT_EQ( comm_plan.totalNumExport(), num_data ); + + // Create the export steering vector. + comm_plan.createSteering( export_ranks ); + + // Check the steering vector. We thread the creation of the steering + // vector so we don't really know what order it is in - only that it is + // grouped by the ranks to which we are exporting. In this case just sort + // the steering vector and make sure all of the ids are there. We can do + // this because we are only sending to one rank. 
+ auto steering = comm_plan.getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + EXPECT_EQ( host_steering.size(), num_data ); + EXPECT_EQ( 0, host_steering(0) ); +} + +//---------------------------------------------------------------------------// +// RUN TESTS +//---------------------------------------------------------------------------// +TEST_F( TEST_CATEGORY, comm_plan_test_1 ) +{ test1(true); } + +TEST_F( TEST_CATEGORY, comm_plan_test_2 ) +{ test2(true); } + +TEST_F( TEST_CATEGORY, comm_plan_test_3 ) +{ test3(true); } + +TEST_F( TEST_CATEGORY, comm_plan_test_4 ) +{ test4(true); } + +TEST_F( TEST_CATEGORY, comm_plan_test_5 ) +{ test5(true); } + +TEST_F( TEST_CATEGORY, comm_plan_test_6 ) +{ test6(true); } + +TEST_F( TEST_CATEGORY, comm_plan_test_1_no_topo ) +{ test1(false); } + +TEST_F( TEST_CATEGORY, comm_plan_test_2_no_topo ) +{ test2(false); } + +TEST_F( TEST_CATEGORY, comm_plan_test_3_no_topo ) +{ test3(false); } + +TEST_F( TEST_CATEGORY, comm_plan_test_4_no_topo ) +{ test4(false); } + +TEST_F( TEST_CATEGORY, comm_plan_test_5_no_topo ) +{ test5(false); } + +TEST_F( TEST_CATEGORY, comm_plan_test_6_no_topo ) +{ test6(false); } + +//---------------------------------------------------------------------------// + +} // end namespace Test From 77cd5e8353f86d0eefbefda045be7a607a47946c Mon Sep 17 00:00:00 2001 From: Stuart Slattery Date: Fri, 21 Dec 2018 12:47:59 -0500 Subject: [PATCH 2/3] adding distributor communication plan for data migration Co-authored-by: dalg24 --- core/src/Cabana_Distributor.hpp | 610 ++++++++++++++++ core/unit_test/Cuda/tstDistributor_Cuda.cpp | 13 + .../OpenMP/tstDistributor_OpenMP.cpp | 13 + .../Pthread/tstDistributor_Pthread.cpp | 13 + .../Serial/tstDistributor_Serial.cpp | 13 + core/unit_test/tstDistributor.hpp | 676 ++++++++++++++++++ 6 files changed, 1338 insertions(+) create mode 100644 core/src/Cabana_Distributor.hpp create mode 100644 core/unit_test/Cuda/tstDistributor_Cuda.cpp create mode 100644 core/unit_test/OpenMP/tstDistributor_OpenMP.cpp create mode 100644 core/unit_test/Pthread/tstDistributor_Pthread.cpp create mode 100644 core/unit_test/Serial/tstDistributor_Serial.cpp create mode 100644 core/unit_test/tstDistributor.hpp diff --git a/core/src/Cabana_Distributor.hpp b/core/src/Cabana_Distributor.hpp new file mode 100644 index 000000000..25e82d9bb --- /dev/null +++ b/core/src/Cabana_Distributor.hpp @@ -0,0 +1,610 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#ifndef CABANA_DISTRIBUTOR_HPP +#define CABANA_DISTRIBUTOR_HPP + +#include +#include +#include + +#include + +#include + +#include +#include +#include + +namespace Cabana +{ +//---------------------------------------------------------------------------// +/*! + \class Distributor + + \brief Distributor is a communication plan for migrating data from one + uniquely-owned decomposition to another uniquely owned decomposition. + + \tparam MemorySpace Memory space in which the data for this class will be + allocated. + + The Distributor allows data to be migrated to an entirely new + decomposition. 
Only uniquely-owned decompositions are handled (i.e. each + local element in the source rank has a single unique destination rank). + + Some nomenclature: + + Export - the data we uniquely own that we will be sending to other ranks. + + Import - the data we uniquely own that we will be receiving from other + ranks. + + \note We can migrate data to the same rank. In this case a copy will occur + instead of communication. + + \note To get the number of elements this rank will be receiving from + migration in the forward communication plan, call totalNumImport() on the + distributor. This will be needed when in-place migration is not used and a + user must allocate their own destination data structure. + +*/ +template +class Distributor : public CommunicationPlan +{ + public: + + /*! + \brief Topology and export rank constructor. Use this when you already + know which ranks neighbor each other (i.e. every rank already knows who + they will be sending and receiving from) as it will be more + efficient. In this case you already know the topology of the + point-to-point communication but not how much data to send to and + receive from the neighbors. + + \tparam ViewType The container type for the export element ranks. This + container type can be either a Kokkos View or a Cabana Slice. + + \param comm The MPI communicator over which the distributor is defined. + + \param element_export_ranks The destination rank in the target + decomposition of each locally owned element in the source + decomposition. Each element will have one unique destination to which it + will be exported. This export rank may be any one of the listed neighbor + ranks which can include the calling rank. An export rank of -1 will + signal that this element is *not* to be exported and will be ignored in + the data migration. The input is expected to be a Kokkos view or Cabana + slice in the same memory space as the distributor. + + \param neighbor_ranks List of ranks this rank will send to and receive + from. This list can include the calling rank. This is effectively a + description of the topology of the point-to-point communication plan. + + \param mpi_tag The MPI tag to use for non-blocking communication in the + communication plan generation. + + \note For elements that you do not wish to export, use an export rank of + -1 to signal that this element is *not* to be exported and will be + ignored in the data migration. In other words, this element will be + *completely* removed in the new decomposition. If the data is staying on + this rank, just use this rank as the export destination and the data + will be efficiently migrated. + */ + template + Distributor( MPI_Comm comm, + const ViewType& element_export_ranks, + const std::vector& neighbor_ranks, + const int mpi_tag = 1221 ) + : CommunicationPlan( comm ) + { + this->createFromExportsAndTopology( + element_export_ranks, neighbor_ranks, mpi_tag ); + this->createExportSteering( element_export_ranks ); + } + + /*! + \brief Export rank constructor. Use this when you don't know who you + will be receiving from - only who you are sending to. This is less + efficient than if we already knew who our neighbors were because we have + to determine the topology of the point-to-point communication first. + + \tparam ViewType The container type for the export element ranks. This + container type can be either a Kokkos View or a Cabana Slice. + + \param comm The MPI communicator over which the distributor is defined. 
+ + \param element_export_ranks The destination rank in the target + decomposition of each locally owned element in the source + decomposition. Each element will have one unique destination to which it + will be exported. This export rank may any one of the listed neighbor + ranks which can include the calling rank. An export rank of -1 will + signal that this element is *not* to be exported and will be ignored in + the data migration. The input is expected to be a Kokkos view or Cabana + slice in the same memory space as the distributor. + + \param mpi_tag The MPI tag to use for non-blocking communication in the + communication plan generation. + + \note For elements that you do not wish to export, use an export rank of + -1 to signal that this element is *not* to be exported and will be + ignored in the data migration. In other words, this element will be + *completely* removed in the new decomposition. If the data is staying on + this rank, just use this rank as the export destination and the data + will be efficiently migrated. + */ + template + Distributor( MPI_Comm comm, + const ViewType& element_export_ranks, + const int mpi_tag = 1221 ) + : CommunicationPlan( comm ) + { + this->createFromExportsOnly( element_export_ranks, mpi_tag ); + this->createExportSteering( element_export_ranks ); + } +}; + +//---------------------------------------------------------------------------// +// Static type checker. +template +struct is_distributor : public std::false_type {}; + +template +struct is_distributor > + : public std::true_type {}; + +template +struct is_distributor > + : public std::true_type {}; + +//---------------------------------------------------------------------------// +namespace Impl +{ + +//---------------------------------------------------------------------------// +// Synchronously move data between a source and destination AoSoA by executing +// the forward communication plan. +template +void distributeData( + const Distributor_t& distributor, + const AoSoA_t& src, + AoSoA_t& dst, + int mpi_tag, + typename std::enable_if<(is_distributor::value && + is_aosoa::value), + int>::type * = 0 ) +{ + // Get the MPI rank we are currently on. + int my_rank = -1; + MPI_Comm_rank( distributor.comm(), &my_rank ); + + // Get the number of neighbors. + int num_n = distributor.numNeighbor(); + + // Calculate the number of elements that are staying on this rank and + // therefore can be directly copied. If any of the neighbor ranks are this + // rank it will be stored in first position (i.e. the first neighbor in + // the local list is always yourself if you are sending to yourself). + std::size_t num_stay = ( distributor.neighborRank(0) == my_rank ) + ? distributor.numExport(0) : 0; + + // Allocate a send buffer. + std::size_t num_send = distributor.totalNumExport() - num_stay; + Kokkos::View + send_buffer( + Kokkos::ViewAllocateWithoutInitializing("distributor_send_buffer"), + num_send ); + + // Allocate a receive buffer. + Kokkos::View + recv_buffer( + Kokkos::ViewAllocateWithoutInitializing("distributor_recv_buffer"), + distributor.totalNumImport() ); + + // Get the steering vector for the sends. + auto steering = distributor.getExportSteering(); + + // Gather the exports from the source AoSoA into the tuple-contiguous send + // buffer or the receive buffer if the data is staying. We know that the + // steering vector is ordered such that the data staying on this rank + // comes first. 
+ auto build_send_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + auto tpl = src.getTuple( steering(i) ); + if ( i < num_stay ) + recv_buffer( i ) = tpl; + else + send_buffer( i - num_stay ) = tpl; + }; + Kokkos::RangePolicy + build_send_buffer_policy( 0, distributor.totalNumExport() ); + Kokkos::parallel_for( "Cabana::Impl::distributeData::build_send_buffer", + build_send_buffer_policy, + build_send_buffer_func ); + Kokkos::fence(); + + // Post non-blocking receives. + std::vector requests; + requests.reserve( num_n ); + std::pair recv_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + recv_range.second = recv_range.first + distributor.numImport(n); + + if ( (distributor.numImport(n) > 0) && + (distributor.neighborRank(n) != my_rank) ) + { + auto recv_subview = Kokkos::subview( recv_buffer, recv_range ); + + requests.push_back( MPI_Request() ); + + MPI_Irecv( recv_subview.data(), + recv_subview.size() * sizeof(typename AoSoA_t::tuple_type), + MPI_BYTE, + distributor.neighborRank(n), + mpi_tag, + distributor.comm(), + &(requests.back()) ); + } + + recv_range.first = recv_range.second; + } + + // Do blocking sends. + std::pair send_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + if ( (distributor.numExport(n) > 0) && + (distributor.neighborRank(n) != my_rank) ) + { + send_range.second = send_range.first + distributor.numExport(n); + + auto send_subview = Kokkos::subview( send_buffer, send_range ); + + MPI_Send( send_subview.data(), + send_subview.size() * sizeof(typename AoSoA_t::tuple_type), + MPI_BYTE, + distributor.neighborRank(n), + mpi_tag, + distributor.comm() ); + + send_range.first = send_range.second; + } + } + + // Wait on non-blocking receives. + std::vector status( requests.size() ); + const int ec = + MPI_Waitall( requests.size(), requests.data(), status.data() ); + assert( MPI_SUCCESS == ec ); + + // Extract the receive buffer into the destination AoSoA. + auto extract_recv_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { dst.setTuple( i, recv_buffer(i) ); }; + Kokkos::RangePolicy + extract_recv_buffer_policy( 0, distributor.totalNumImport() ); + Kokkos::parallel_for( "Cabana::Impl::distributeData::extract_recv_buffer", + extract_recv_buffer_policy, + extract_recv_buffer_func ); + Kokkos::fence(); + + // Barrier before completing to ensure synchronization. + MPI_Barrier( distributor.comm() ); +} + +//---------------------------------------------------------------------------// + +} // end namespace Impl + +//---------------------------------------------------------------------------// +/*! + \brief Synchronously migrate data between two different decompositions using + the distributor forward communication plan. Multiple AoSoA version. + + Migrate moves all data to a new distribution that is uniquely owned - each + element will only have a single destination rank. + + \tparam Distributor_t Distributor type - must be a distributor. + + \tparam AoSoA_t AoSoA type - must be an AoSoA. + + \param distributor The distributor to use for the migration. + + \param src The AoSoA containing the data to be migrated. Must have the same + number of elements as the inputs used to construct the distributor. + + \param dst The AoSoA to which the migrated data will be written. Must be the + same size as the number of imports given by the distributor on this + rank. Call totalNumImport() on the distributor to get this size value. + + \param mpi_tag The MPI tag to use for non-blocking communication in the data + migration. 
Note here that if multiple instances of this function are being + called at once then different tags should be used in each function call to + avoid any communication conflicts. +*/ +template +void migrate( const Distributor_t& distributor, + const AoSoA_t& src, + AoSoA_t& dst, + int mpi_tag = 1001, + typename std::enable_if<(is_distributor::value && + is_aosoa::value), + int>::type * = 0 ) +{ + // Check that src and dst are the right size. + if ( src.size() != distributor.exportSize() ) + throw std::runtime_error("Source is the wrong size for migration!"); + if ( dst.size() != distributor.totalNumImport() ) + throw std::runtime_error("Destination is the wrong size for migration!"); + + // Move the data. + Impl::distributeData( distributor, src, dst, mpi_tag ); +} + +//---------------------------------------------------------------------------// +/*! + \brief Synchronously migrate data between two different decompositions using + the distributor forward communication plan. Single AoSoA version that will + resize in-place. Note that resizing does not necessarily allocate more + memory. The AoSoA memory will only increase if not enough has already been + reserved/allocated for the needed number of elements. + + Migrate moves all data to a new distribution that is uniquely owned - each + element will only have a single destination rank. + + \tparam Distributor_t Distributor type - must be a distributor. + + \tparam AoSoA_t AoSoA type - must be an AoSoA. + + \param distributor The distributor to use for the migration. + + \param AoSoA The AoSoA containing the data to be migrated. Upon input, must + have the same number of elements as the inputs used to construct the + destributor. At output, it will be the same size as th enumber of import + elements on this rank provided by the distributor. Before using this + function, consider reserving enough memory in the data structure so + reallocating is not necessary. + + \param mpi_tag The MPI tag to use for non-blocking communication in the data + migration. Note here that if multiple instances of this function are being + called at once then different tags should be used in each function call to + avoid any communication conflicts. +*/ +template +void migrate( const Distributor_t& distributor, + AoSoA_t& aosoa, + int mpi_tag = 1001, + typename std::enable_if<(is_distributor::value && + is_aosoa::value), + int>::type * = 0 ) +{ + // Check that the AoSoA is the right size. + if ( aosoa.size() != distributor.exportSize() ) + throw std::runtime_error("AoSoA is the wrong size for migration!"); + + // Determine if the source of destination decomposition has more data on + // this rank. + bool dst_is_bigger = + ( distributor.totalNumImport() > distributor.exportSize() ); + + // If the destination decomposition is bigger than the source + // decomposition resize now so we have enough space to do the operation. + if ( dst_is_bigger ) + aosoa.resize( distributor.totalNumImport() ); + + // Move the data. + Impl::distributeData( distributor, aosoa, aosoa, mpi_tag ); + + // If the destination decomposition is smaller than the source + // decomposition resize after we have moved the data. + if ( !dst_is_bigger ) + aosoa.resize( distributor.totalNumImport() ); +} + +//---------------------------------------------------------------------------// +/*! + \brief Synchronously migrate data between two different decompositions using + the distributor forward communication plan. Slice version. 
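To make the two AoSoA overloads above concrete, here is a minimal self-send sketch. It is illustrative only and not part of the patch: the Kokkos::HostSpace tag (standing in for the TEST_MEMSPACE macro used by the unit tests), the member types, and the element count are assumptions.

    #include <Cabana_Core.hpp>
    #include <Cabana_Distributor.hpp>
    #include <Kokkos_Core.hpp>
    #include <mpi.h>

    int main( int argc, char* argv[] )
    {
        MPI_Init( &argc, &argv );
        Kokkos::initialize( argc, argv );
        {
            int my_rank = -1;
            MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );

            // Assumed memory-space tag; the unit tests use TEST_MEMSPACE here.
            using MemSpace = Kokkos::HostSpace;

            // Every element is exported back to this rank (pure self-send).
            int num_data = 10;
            Kokkos::View<int*, MemSpace> export_ranks( "export_ranks", num_data );
            Kokkos::deep_copy( export_ranks, my_rank );
            Cabana::Distributor<MemSpace> distributor( MPI_COMM_WORLD, export_ranks );

            // Two-argument form: separate source and destination AoSoAs.
            using Types = Cabana::MemberTypes<int, double[2]>;
            Cabana::AoSoA<Types, MemSpace> src( num_data );
            Cabana::AoSoA<Types, MemSpace> dst( distributor.totalNumImport() );
            Cabana::migrate( distributor, src, dst );

            // One-argument form: migrate the same data in place, resizing if needed.
            Cabana::migrate( distributor, src );
        }
        Kokkos::finalize();
        MPI_Finalize();
        return 0;
    }

The two-argument form requires the caller to size the destination with totalNumImport(); the one-argument form resizes for you, which is why the second call needs no destination.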
The user can do + this in-place with the same slice but they will need to manage the resizing + themselves as we can't resize slices. + + Migrate moves all data to a new distribution that is uniquely owned - each + element will only have a single destination rank. + + \tparam Distributor_t Distributor type - must be a distributor. + + \tparam Slice_t Slice type - must be an Slice. + + \param distributor The distributor to use for the migration. + + \param src The slice containing the data to be migrated. Must have the same + number of elements as the inputs used to construct the destributor. + + \param dst The slice to which the migrated data will be written. Must be the + same size as the number of imports given by the distributor on this + rank. Call totalNumImport() on the distributor to get this size value. + + \param mpi_tag The MPI tag to use for non-blocking communication in the data + migration. Note here that if multiple instances of this function are being + called at once then different tags should be used in each function call to + avoid any communication conflicts. +*/ +template +void migrate( const Distributor_t& distributor, + const Slice_t& src, + Slice_t& dst, + int mpi_tag = 1001, + typename std::enable_if<(is_distributor::value && + is_slice::value), + int>::type * = 0 ) +{ + // Check that src and dst are the right size. + if ( src.size() != distributor.exportSize() ) + throw std::runtime_error("Source is the wrong size for migration!"); + if ( dst.size() != distributor.totalNumImport() ) + throw std::runtime_error("Destination is the wrong size for migration!"); + + // Get the number of components in the slices. + int num_comp = 1; + for ( int d = 2; d < src.rank(); ++d ) + num_comp *= src.extent(d); + + // Get the raw slice data. + auto src_data = src.data(); + auto dst_data = dst.data(); + + // Get the MPI rank we are currently on. + int my_rank = -1; + MPI_Comm_rank( distributor.comm(), &my_rank ); + + // Get the number of neighbors. + int num_n = distributor.numNeighbor(); + + // Calculate the number of elements that are staying on this rank and + // therefore can be directly copied. If any of the neighbor ranks are this + // rank it will be stored in first position (i.e. the first neighbor in + // the local list is always yourself if you are sending to yourself). + std::size_t num_stay = ( distributor.neighborRank(0) == my_rank ) + ? distributor.numExport(0) : 0; + + // Allocate a send buffer. Note this one is layout right so the components + // of each element are consecutive in memory. + std::size_t num_send = distributor.totalNumExport() - num_stay; + Kokkos::View + send_buffer( + Kokkos::ViewAllocateWithoutInitializing("distributor_send_buffer"), + num_send, num_comp ); + + // Allocate a receive buffer. Note this one is layout right so the components + // of each element are consecutive in memory. + Kokkos::View + recv_buffer( + Kokkos::ViewAllocateWithoutInitializing("distributor_recv_buffer"), + distributor.totalNumImport(), num_comp ); + + // Get the steering vector for the sends. + auto steering = distributor.getExportSteering(); + + // Gather from the source Slice into the contiguous send buffer or, + // if it is part of the local copy, put it directly in the destination + // Slice. 
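    // Layout reminder: the buffers are LayoutRight with one row per element
    // and one column per component, so each element's components are
    // contiguous and can be shipped as a single block of bytes. For a slice,
    // element steering(i) lives at struct index s and array index a, i.e. at
    // raw offset s*stride(0) + a, with component n a further
    // n*vector_length entries away.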
+ auto build_send_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + auto s_src = Slice_t::index_type::s( steering(i) ); + auto a_src = Slice_t::index_type::a( steering(i) ); + std::size_t src_offset = s_src*src.stride(0) + a_src; + if ( i < num_stay ) + for ( int n = 0; n < num_comp; ++n ) + recv_buffer( i, n ) = + src_data[ src_offset + n * Slice_t::vector_length ]; + else + for ( int n = 0; n < num_comp; ++n ) + send_buffer( i - num_stay, n ) = + src_data[ src_offset + n * Slice_t::vector_length ]; + }; + Kokkos::RangePolicy + build_send_buffer_policy( 0, distributor.totalNumExport() ); + Kokkos::parallel_for( "Cabana::migrate::build_send_buffer", + build_send_buffer_policy, + build_send_buffer_func ); + Kokkos::fence(); + + // Post non-blocking receives. + std::vector requests; + requests.reserve( num_n ); + std::pair recv_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + recv_range.second = recv_range.first + distributor.numImport(n); + + if ( (distributor.numImport(n) > 0) && + (distributor.neighborRank(n) != my_rank) ) + { + auto recv_subview = Kokkos::subview( + recv_buffer, recv_range, Kokkos::ALL ); + + requests.push_back( MPI_Request() ); + + MPI_Irecv( recv_subview.data(), + recv_subview.size() * sizeof(typename Slice_t::value_type), + MPI_BYTE, + distributor.neighborRank(n), + mpi_tag, + distributor.comm(), + &(requests.back()) ); + } + + recv_range.first = recv_range.second; + } + + // Do blocking sends. + std::pair send_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + if ( (distributor.numExport(n) > 0) && + (distributor.neighborRank(n) != my_rank) ) + { + send_range.second = send_range.first + distributor.numExport(n); + + auto send_subview = Kokkos::subview( + send_buffer, send_range, Kokkos::ALL ); + + MPI_Send( send_subview.data(), + send_subview.size() * sizeof(typename Slice_t::value_type), + MPI_BYTE, + distributor.neighborRank(n), + mpi_tag, + distributor.comm() ); + + send_range.first = send_range.second; + } + } + + // Wait on non-blocking receives. + std::vector status( requests.size() ); + const int ec = + MPI_Waitall( requests.size(), requests.data(), status.data() ); + assert( MPI_SUCCESS == ec ); + + // Extract the data from the receive buffer into the destination Slice. + auto extract_recv_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + auto s = Slice_t::index_type::s( i ); + auto a = Slice_t::index_type::a( i ); + std::size_t dst_offset = s*dst.stride(0) + a; + for ( int n = 0; n < num_comp; ++n ) + dst_data[ dst_offset + n * Slice_t::vector_length ] = + recv_buffer( i, n ); + }; + Kokkos::RangePolicy + extract_recv_buffer_policy( 0, distributor.totalNumImport() ); + Kokkos::parallel_for( "Cabana::migrate::extract_recv_buffer", + extract_recv_buffer_policy, + extract_recv_buffer_func ); + Kokkos::fence(); + + // Barrier before completing to ensure synchronization. + MPI_Barrier( distributor.comm() ); +} + +//---------------------------------------------------------------------------// + +} // end namespace Cabana + +#endif // end CABANA_DISTRIBUTOR_HPP diff --git a/core/unit_test/Cuda/tstDistributor_Cuda.cpp b/core/unit_test/Cuda/tstDistributor_Cuda.cpp new file mode 100644 index 000000000..bbc92c129 --- /dev/null +++ b/core/unit_test/Cuda/tstDistributor_Cuda.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. 
Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/OpenMP/tstDistributor_OpenMP.cpp b/core/unit_test/OpenMP/tstDistributor_OpenMP.cpp new file mode 100644 index 000000000..42285bd63 --- /dev/null +++ b/core/unit_test/OpenMP/tstDistributor_OpenMP.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/Pthread/tstDistributor_Pthread.cpp b/core/unit_test/Pthread/tstDistributor_Pthread.cpp new file mode 100644 index 000000000..e74cc6309 --- /dev/null +++ b/core/unit_test/Pthread/tstDistributor_Pthread.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/Serial/tstDistributor_Serial.cpp b/core/unit_test/Serial/tstDistributor_Serial.cpp new file mode 100644 index 000000000..b2cdb0609 --- /dev/null +++ b/core/unit_test/Serial/tstDistributor_Serial.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/tstDistributor.hpp b/core/unit_test/tstDistributor.hpp new file mode 100644 index 000000000..08798e2f8 --- /dev/null +++ b/core/unit_test/tstDistributor.hpp @@ -0,0 +1,676 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include + +namespace Test +{ + +//---------------------------------------------------------------------------// +void test1( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > distributor; + + // Get my rank. 
+ int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Every rank will communicate with itself and send all of its data. + int num_data = 10; + Kokkos::View + export_ranks( "export_ranks", num_data ); + Kokkos::deep_copy( export_ranks, my_rank ); + std::vector neighbor_ranks( 1, my_rank ); + + // Create the plan. + if ( use_topology ) + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks, neighbor_ranks ); + else + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks ); + + // Make some data to migrate. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data_src( num_data ); + auto slice_int_src = data_src.slice<0>(); + auto slice_dbl_src = data_src.slice<1>(); + + // Fill the data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int_src(i) = my_rank + i; + slice_dbl_src(i,0) = my_rank + i; + slice_dbl_src(i,1) = my_rank + i + 0.5; + }; + Kokkos::RangePolicy range_policy( 0, num_data ); + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Create a second set of data to which we will migrate. + AoSoA_t data_dst( num_data ); + auto slice_int_dst = data_dst.slice<0>(); + auto slice_dbl_dst = data_dst.slice<1>(); + + // Do the migration + Cabana::migrate( *distributor, data_src, data_dst ); + + // Check the migration. + Cabana::AoSoA data_dst_host( num_data ); + auto slice_int_dst_host = data_dst_host.slice<0>(); + auto slice_dbl_dst_host = data_dst_host.slice<1>(); + Cabana::deep_copy( data_dst_host, data_dst ); + auto steering = distributor->getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + for ( int i = 0; i < num_data; ++i ) + { + EXPECT_EQ( slice_int_dst_host(i), my_rank + host_steering(i) ); + EXPECT_EQ( slice_dbl_dst_host(i,0), my_rank + host_steering(i) ); + EXPECT_EQ( slice_dbl_dst_host(i,1), my_rank + host_steering(i) + 0.5 ); + } +} + +//---------------------------------------------------------------------------// +void test2( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > distributor; + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Every rank will communicate with itself and send every other piece of data. + int num_data = 10; + Kokkos::View export_ranks_host( + "export_ranks", num_data ); + for ( int n = 0; n < num_data; ++n ) + export_ranks_host(n) = ( 0 == n%2 ) ? my_rank : -1; + auto export_ranks = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ranks_host ); + std::vector neighbor_ranks( 1, my_rank ); + + // Create the plan + if ( use_topology ) + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks, neighbor_ranks ); + else + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks ); + + // Make some data to migrate. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data( num_data ); + auto slice_int = data.slice<0>(); + auto slice_dbl = data.slice<1>(); + + // Fill the data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int(i) = my_rank + i; + slice_dbl(i,0) = my_rank + i; + slice_dbl(i,1) = my_rank + i + 0.5; + }; + Kokkos::RangePolicy + range_policy( 0, num_data ); + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Do the migration in-place + Cabana::migrate( *distributor, data ); + + // Get host copies of the migrated data. 
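    // Only the even-indexed elements had a valid export rank; the odd ones
    // were marked -1 and dropped, so the in-place migrate above shrank the
    // AoSoA from num_data to num_data/2 elements.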
+ Cabana::AoSoA data_host( num_data / 2 ); + auto slice_int_host = data_host.slice<0>(); + auto slice_dbl_host = data_host.slice<1>(); + Cabana::deep_copy( data_host, data ); + + // Check the migration. We received less than we sent so this should have + // resized the aososa. + auto steering = distributor->getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + EXPECT_EQ( data.size(), num_data / 2 ); + for ( int i = 0; i < num_data / 2; ++i ) + { + EXPECT_EQ( slice_int_host(i), my_rank + host_steering(i) ); + EXPECT_EQ( slice_dbl_host(i,0), my_rank + host_steering(i) ); + EXPECT_EQ( slice_dbl_host(i,1), my_rank + host_steering(i) + 0.5 ); + } +} + +//---------------------------------------------------------------------------// +void test3( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > distributor; + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get my size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Compute the inverse rank. + int inverse_rank = my_size - my_rank - 1; + + // Every rank will communicate with the rank that is its inverse. + int num_data = 10; + Kokkos::View + export_ranks( "export_ranks", num_data ); + Kokkos::deep_copy( export_ranks, inverse_rank ); + std::vector neighbor_ranks( 1, inverse_rank ); + + // Create the plan with both export ranks and the topology. + if ( use_topology ) + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks, neighbor_ranks ); + else + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks ); + + // Make some data to migrate. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data_src( num_data ); + auto slice_int_src = data_src.slice<0>(); + auto slice_dbl_src = data_src.slice<1>(); + + // Fill the data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int_src(i) = my_rank + i; + slice_dbl_src(i,0) = my_rank + i; + slice_dbl_src(i,1) = my_rank + i + 0.5; + }; + Kokkos::RangePolicy + range_policy( 0, num_data ); + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Create a second set of data to which we will migrate. + AoSoA_t data_dst( num_data ); + auto slice_int_dst = data_dst.slice<0>(); + auto slice_dbl_dst = data_dst.slice<1>(); + + // Do the migration with slices + Cabana::migrate( *distributor, slice_int_src, slice_int_dst ); + Cabana::migrate( *distributor, slice_dbl_src, slice_dbl_dst ); + + // Exchange steering vectors with your inverse rank so we know what order + // they sent us stuff in. We thread the creation of the steering vector so + // its order is not deterministic. + auto my_steering = distributor->getExportSteering(); + Kokkos::View + inverse_steering( "inv_steering", distributor->totalNumImport() ); + int mpi_tag = 1030; + MPI_Request request; + MPI_Irecv( inverse_steering.data(), inverse_steering.size(), + MPI_UNSIGNED_LONG_LONG, inverse_rank, mpi_tag, + MPI_COMM_WORLD, &request ); + MPI_Send( my_steering.data(), my_steering.size(), + MPI_UNSIGNED_LONG_LONG, inverse_rank, mpi_tag, + MPI_COMM_WORLD ); + MPI_Status status; + MPI_Wait( &request, &status ); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), inverse_steering ); + + // Check the migration. 
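    // host_steering now holds the order in which the inverse rank packed its
    // exports, so received element i originated from its local element
    // host_steering(i) and should carry the value inverse_rank + host_steering(i).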
+ Cabana::AoSoA data_dst_host( num_data ); + Cabana::deep_copy( data_dst_host, data_dst ); + auto slice_int_dst_host = data_dst_host.slice<0>(); + auto slice_dbl_dst_host = data_dst_host.slice<1>(); + for ( int i = 0; i < num_data; ++i ) + { + EXPECT_EQ( slice_int_dst_host(i), inverse_rank + host_steering(i) ); + EXPECT_EQ( slice_dbl_dst_host(i,0), inverse_rank + host_steering(i) ); + EXPECT_EQ( slice_dbl_dst_host(i,1), inverse_rank + host_steering(i) + 0.5 ); + } +} + +//---------------------------------------------------------------------------// +void test4( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > distributor; + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get my size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Every rank will communicate with all other ranks. Interleave the sends. + int num_data = 2 * my_size; + Kokkos::View export_ranks_host( + "export_ranks", num_data ); + std::vector neighbor_ranks( my_size ); + for ( int n = 0; n < my_size; ++n ) + { + export_ranks_host[n] = n; + export_ranks_host[n + my_size] = n; + neighbor_ranks[n] = n; + } + auto export_ranks = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ranks_host ); + for ( int n = 0; n < my_size; ++n ) + { + export_ranks[n] = n; + export_ranks[n + my_size] = n; + neighbor_ranks[n] = n; + } + + // Create the plan + if ( use_topology ) + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks, neighbor_ranks ); + else + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks ); + + // Make some data to migrate. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data_src( num_data ); + auto slice_int_src = data_src.slice<0>(); + auto slice_dbl_src = data_src.slice<1>(); + + // Fill the data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int_src(i) = my_rank; + slice_dbl_src(i,0) = my_rank; + slice_dbl_src(i,1) = my_rank + 0.5; + }; + Kokkos::RangePolicy + range_policy( 0, num_data ); + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Create a second set of data to which we will migrate. + AoSoA_t data_dst( num_data ); + auto slice_int_dst = data_dst.slice<0>(); + auto slice_dbl_dst = data_dst.slice<1>(); + + // Do the migration + Cabana::migrate( *distributor, data_src, data_dst ); + + // Check the migration. 
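    // Expected receive ordering: the two self-sent elements come first, and
    // the remaining neighbors follow in ascending rank order except that
    // rank 0's data lands in the slot this rank vacated, which is why the
    // loop below expects rank 0's values at position i == my_rank.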
+ Cabana::AoSoA data_dst_host( num_data ); + auto slice_int_dst_host = data_dst_host.slice<0>(); + auto slice_dbl_dst_host = data_dst_host.slice<1>(); + Cabana::deep_copy( data_dst_host, data_dst ); + + // self sends + EXPECT_EQ( slice_int_dst_host(0), my_rank ); + EXPECT_EQ( slice_dbl_dst_host(0,0), my_rank ); + EXPECT_EQ( slice_dbl_dst_host(0,1), my_rank + 0.5 ); + + EXPECT_EQ( slice_int_dst_host(1), my_rank ); + EXPECT_EQ( slice_dbl_dst_host(1,0), my_rank ); + EXPECT_EQ( slice_dbl_dst_host(1,1), my_rank + 0.5 ); + + // others + for ( int i = 1; i < my_size; ++i ) + { + if ( i == my_rank ) + { + EXPECT_EQ( slice_int_dst_host(2*i), 0 ); + EXPECT_EQ( slice_dbl_dst_host(2*i,0), 0 ); + EXPECT_EQ( slice_dbl_dst_host(2*i,1), 0.5 ); + + EXPECT_EQ( slice_int_dst_host(2*i + 1), 0 ); + EXPECT_EQ( slice_dbl_dst_host(2*i + 1,0), 0 ); + EXPECT_EQ( slice_dbl_dst_host(2*i + 1,1), 0.5 ); + } + else + { + EXPECT_EQ( slice_int_dst_host(2*i), i ); + EXPECT_EQ( slice_dbl_dst_host(2*i,0), i ); + EXPECT_EQ( slice_dbl_dst_host(2*i,1), i + 0.5 ); + + EXPECT_EQ( slice_int_dst_host(2*i + 1), i ); + EXPECT_EQ( slice_dbl_dst_host(2*i + 1,0), i ); + EXPECT_EQ( slice_dbl_dst_host(2*i + 1,1), i + 0.5 ); + } + } +} + +//---------------------------------------------------------------------------// +void test5( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > distributor; + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get my size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Every rank will communicate with all other ranks. Interleave the sends + // and only send every other value. + int num_data = 2 * my_size; + Kokkos::View export_ranks_host( + "export_ranks", num_data ); + std::vector neighbor_ranks( my_size ); + for ( int n = 0; n < my_size; ++n ) + { + export_ranks_host[n] = -1; + export_ranks_host[n + my_size] = n; + neighbor_ranks[n] = n; + } + auto export_ranks = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ranks_host ); + + // Create the plan + if ( use_topology ) + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks, neighbor_ranks ); + else + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks ); + + // Make some data to migrate. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data_src( num_data ); + auto slice_int_src = data_src.slice<0>(); + auto slice_dbl_src = data_src.slice<1>(); + + // Fill the data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int_src(i) = my_rank; + slice_dbl_src(i,0) = my_rank; + slice_dbl_src(i,1) = my_rank + 0.5; + }; + Kokkos::RangePolicy + range_policy( 0, num_data ); + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Create a second set of data to which we will migrate. + AoSoA_t data_dst( my_size ); + auto slice_int_dst = data_dst.slice<0>(); + auto slice_dbl_dst = data_dst.slice<1>(); + + // Do the migration with slices + Cabana::migrate( *distributor, slice_int_src, slice_int_dst ); + Cabana::migrate( *distributor, slice_dbl_src, slice_dbl_dst ); + + // Check the migration. 
+ Cabana::AoSoA data_host( my_size ); + auto slice_int_host = data_host.slice<0>(); + auto slice_dbl_host = data_host.slice<1>(); + Cabana::deep_copy( data_host, data_dst ); + + // self sends + EXPECT_EQ( slice_int_host(0), my_rank ); + EXPECT_EQ( slice_dbl_host(0,0), my_rank ); + EXPECT_EQ( slice_dbl_host(0,1), my_rank + 0.5 ); + + // others + for ( int i = 1; i < my_size; ++i ) + { + if ( i == my_rank ) + { + EXPECT_EQ( slice_int_host(i), 0 ); + EXPECT_EQ( slice_dbl_host(i,0), 0 ); + EXPECT_EQ( slice_dbl_host(i,1), 0.5 ); + } + else + { + EXPECT_EQ( slice_int_host(i), i ); + EXPECT_EQ( slice_dbl_host(i,0), i ); + EXPECT_EQ( slice_dbl_host(i,1), i + 0.5 ); + } + } +} + +//---------------------------------------------------------------------------// +void test6( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > distributor; + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get the comm size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Every has one element and will send that element to rank 0. + int num_data = 1; + Kokkos::View + export_ranks( "export_ranks", num_data ); + Kokkos::deep_copy( export_ranks, 0 ); + std::vector neighbor_ranks; + if ( 0 == my_rank ) + { + neighbor_ranks.resize( my_size ); + std::iota( neighbor_ranks.begin(), neighbor_ranks.end(), 0 ); + } + else + { + neighbor_ranks.assign( 1, 0 ); + } + + // Create the plan. + if ( use_topology ) + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks, neighbor_ranks ); + else + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks ); + + // Make some data to migrate. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data( num_data ); + auto slice_int = data.slice<0>(); + auto slice_dbl = data.slice<1>(); + + // Fill the data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int(i) = my_rank; + slice_dbl(i,0) = my_rank; + slice_dbl(i,1) = my_rank + 0.5; + }; + Kokkos::RangePolicy range_policy( 0, num_data ); + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Do the migration + Cabana::migrate( *distributor, data ); + + // Check the change in size. + if ( 0 == my_rank ) + EXPECT_EQ( data.size(), my_size ); + else + EXPECT_EQ( data.size(), 0 ); + + // Check the migration. + Cabana::AoSoA + data_host( distributor->totalNumImport() ); + auto slice_int_host = data_host.slice<0>(); + auto slice_dbl_host = data_host.slice<1>(); + Cabana::deep_copy( data_host, data ); + auto steering = distributor->getExportSteering(); + auto host_steering = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), steering ); + for ( int i = 0; i < distributor->totalNumImport(); ++i ) + { + EXPECT_EQ( slice_int_host(i), distributor->neighborRank(i) ); + EXPECT_EQ( slice_dbl_host(i,0), distributor->neighborRank(i) ); + EXPECT_EQ( slice_dbl_host(i,1), distributor->neighborRank(i) + 0.5 ); + } +} + +//---------------------------------------------------------------------------// +void test7( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > distributor; + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get the comm size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Rank 0 starts with all the data and sends one element to every rank. + int num_data = (0 == my_rank) ? 
my_size : 0; + Kokkos::View + export_ranks( "export_ranks", num_data ); + auto fill_ranks = + KOKKOS_LAMBDA( const int i ) + { export_ranks(i) = i; }; + Kokkos::RangePolicy range_policy( 0, num_data ); + Kokkos::parallel_for( range_policy, fill_ranks ); + Kokkos::fence(); + std::vector neighbor_ranks; + if ( 0 == my_rank ) + { + neighbor_ranks.resize( my_size ); + std::iota( neighbor_ranks.begin(), neighbor_ranks.end(), 0 ); + } + else + { + neighbor_ranks.assign( 1, 0 ); + } + + // Create the plan. + if ( use_topology ) + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks, neighbor_ranks ); + else + distributor = std::make_shared >( + MPI_COMM_WORLD, export_ranks ); + + // Make some data to migrate. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data( num_data ); + auto slice_int = data.slice<0>(); + auto slice_dbl = data.slice<1>(); + + // Fill the data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int(i) = i; + slice_dbl(i,0) = i; + slice_dbl(i,1) = i + 0.5; + }; + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Do the migration + Cabana::migrate( *distributor, data ); + + // Check the change in size. + EXPECT_EQ( data.size(), 1 ); + + // Check the migration. + Cabana::AoSoA + data_host( distributor->totalNumImport() ); + auto slice_int_host = data_host.slice<0>(); + auto slice_dbl_host = data_host.slice<1>(); + Cabana::deep_copy( data_host, data ); + EXPECT_EQ( slice_int_host(0), my_rank ); + EXPECT_EQ( slice_dbl_host(0,0), my_rank ); + EXPECT_EQ( slice_dbl_host(0,1), my_rank + 0.5 ); +} + +//---------------------------------------------------------------------------// +// RUN TESTS +//---------------------------------------------------------------------------// +TEST_F( TEST_CATEGORY, distributor_test_1 ) +{ test1(true); } + +TEST_F( TEST_CATEGORY, distributor_test_2 ) +{ test2(true); } + +TEST_F( TEST_CATEGORY, distributor_test_3 ) +{ test3(true); } + +TEST_F( TEST_CATEGORY, distributor_test_4 ) +{ test4(true); } + +TEST_F( TEST_CATEGORY, distributor_test_5 ) +{ test5(true); } + +TEST_F( TEST_CATEGORY, distributor_test_6 ) +{ test6(true); } + +TEST_F( TEST_CATEGORY, distributor_test_7 ) +{ test7(true); } + +TEST_F( TEST_CATEGORY, distributor_test_1_no_topo ) +{ test1(false); } + +TEST_F( TEST_CATEGORY, distributor_test_2_no_topo ) +{ test2(false); } + +TEST_F( TEST_CATEGORY, distributor_test_3_no_topo ) +{ test3(false); } + +TEST_F( TEST_CATEGORY, distributor_test_4_no_topo ) +{ test4(false); } + +TEST_F( TEST_CATEGORY, distributor_test_5_no_topo ) +{ test5(false); } + +TEST_F( TEST_CATEGORY, distributor_test_6_no_topo ) +{ test6(false); } + +TEST_F( TEST_CATEGORY, distributor_test_7_no_topo ) +{ test7(false); } + +//---------------------------------------------------------------------------// + +} // end namespace Test From 457977064bbf0f221fb32a11fa2a3f1877bf1f2a Mon Sep 17 00:00:00 2001 From: Stuart Slattery Date: Fri, 21 Dec 2018 12:48:17 -0500 Subject: [PATCH 3/3] adding halo communication plan for halo exchange of ghost data Co-authored-by: dalg24 --- core/src/Cabana_Halo.hpp | 670 +++++++++++++++++++++ core/unit_test/Cuda/tstHalo_Cuda.cpp | 13 + core/unit_test/OpenMP/tstHalo_OpenMP.cpp | 13 + core/unit_test/Pthread/tstHalo_Pthread.cpp | 13 + core/unit_test/Serial/tstHalo_Serial.cpp | 13 + core/unit_test/mpi_unit_test_main.cpp | 27 + core/unit_test/tstHalo.hpp | 424 +++++++++++++ 7 files changed, 1173 insertions(+) create mode 100644 core/src/Cabana_Halo.hpp create mode 100644 
core/unit_test/Cuda/tstHalo_Cuda.cpp create mode 100644 core/unit_test/OpenMP/tstHalo_OpenMP.cpp create mode 100644 core/unit_test/Pthread/tstHalo_Pthread.cpp create mode 100644 core/unit_test/Serial/tstHalo_Serial.cpp create mode 100644 core/unit_test/mpi_unit_test_main.cpp create mode 100644 core/unit_test/tstHalo.hpp diff --git a/core/src/Cabana_Halo.hpp b/core/src/Cabana_Halo.hpp new file mode 100644 index 000000000..9ee978074 --- /dev/null +++ b/core/src/Cabana_Halo.hpp @@ -0,0 +1,670 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#ifndef CABANA_HALO_HPP +#define CABANA_HALO_HPP + +#include +#include +#include + +#include + +#include + +#include +#include +#include + +namespace Cabana +{ +//---------------------------------------------------------------------------// +/*! + \class Halo + + \brief Halo communication plan for scattering and gathering of ghosted + data. + + \tparam MemorySpace Memory space in which the data for this class will be + allocated. + + The halo allows for scatter and gather operations between locally-owned and + ghosted data. All data in the Halo (e.g. export and import data) is from the + point of view of the forward *GATHER* operation such that, for example, the + number of exports is the number of exports in the gather and the number of + imports is the number of imports in the gather. The reverse *SCATTER* + operation sends the ghosted data back the the uniquely-owned decomposition + and resolves collisions with atomic addition. Based on input for the forward + communication plan (where local data will be sent) the local number of + ghosts is computed. Some nomenclature: + + Export - the local data we uniquely own that we will send to other ranks for + those ranks to be used as ghosts. Export is used in the context of the + forward communication plan (the gather). + + Import - the ghost data that we get from other ranks. The rank we get a + ghost from is the unique owner of that data. Import is used in the context + of the forward communication plan (the gather). +*/ +template +class Halo : public CommunicationPlan +{ + public: + + /*! + \brief Neighbor and export rank constructor. Use this when you already + know which ranks neighbor each other (i.e. every rank already knows who + they will be exporting to and receiving from) as it will be more + efficient. In this case you already know the topology of the + point-to-point communication but not how much data to send and receive + from the neighbors. + + \tparam IdViewType The container type for the export element ids. This + container type can be either a Kokkos View or a Cabana Slice. + + \tparam RankViewType The container type for the export element + ranks. This container type can be either a Kokkos View or a Cabana + Slice. + + \param comm The MPI communicator over which the halo is defined. + + \param num_local The number of locally-owned elements on this rank. + + \param element_export_ids The local ids of the elements that will be + exported to other ranks to be used as ghosts. Element ids may be + repeated in this list if they are sent to multiple destinations. 
Must be + the same length as element_export_ranks. The input is expected to be a + Kokkos view or Cabana slice in the same memory space as the + communication plan. + + \param element_export_ranks The ranks to which we will send each element + in element_export_ids. In this case each rank must be one of the + neighbor ranks. Must be the same length as element_export_ids. A rank is + allowed to send to itself. The input is expected to be a Kokkos view or + Cabana slice in the same memory space as the communication plan. + + \param neighbor_ranks List of ranks this rank will send to and receive + from. This list can include the calling rank. This is effectively a + description of the topology of the point-to-point communication plan. + + \param mpi_tag The MPI tag to use for non-blocking communication in the + communication plan generation. + + \note Calling this function completely updates the state of this object + and invalidates the previous state. + */ + template + Halo( MPI_Comm comm, + const std::size_t num_local, + const IdViewType& element_export_ids, + const RankViewType& element_export_ranks, + const std::vector& neighbor_ranks, + const int mpi_tag = 1221 ) + : CommunicationPlan( comm ) + , _num_local( num_local ) + { + if ( element_export_ids.size() != element_export_ranks.size() ) + throw std::runtime_error("Export ids and ranks different sizes!"); + + this->createFromExportsAndTopology( + element_export_ranks, neighbor_ranks, mpi_tag ); + this->createExportSteering( element_export_ranks, element_export_ids ); + } + + /*! + \brief Export rank constructor. Use this when you don't know who you + will receiving from - only who you are sending to. This is less + efficient than if we already knew who our neighbors were because we have + to determine the topology of the point-to-point communication first. + + \tparam IdViewType The container type for the export element ids. This + container type can be either a Kokkos View or a Cabana Slice. + + \tparam RankViewType The container type for the export element + ranks. This container type can be either a Kokkos View or a Cabana + Slice. + + \param comm The MPI communicator over which the halo is defined. + + \param num_local The number of locally-owned elements on this rank. + + \param element_export_ids The local ids of the elements that will be + sent to other ranks to be used as ghosts. Element ids may be repeated in + this list if they are sent to multiple destinations. Must be the same + length as element_export_ranks. The input is expected to be a Kokkos + view or Cabana slice in the same memory space as the communication plan. + + \param element_export_ranks The ranks to which we will export each element + in element_export_ids. Must be the same length as + element_export_ids. The neighbor ranks will be determined from this + list. A rank is allowed to send to itself. The input is expected to be a + Kokkos view or Cabana slice in the same memory space as the + communication plan. + + \param mpi_tag The MPI tag to use for non-blocking communication in the + communication plan generation. + + \note Calling this function completely updates the state of this object + and invalidates the previous state. 
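As an illustration of the intended call sequence for this constructor (not part of the patch; the member types, counts, and the Kokkos::HostSpace tag are assumptions, and MPI and Kokkos are presumed already initialized by the caller):

    #include <Cabana_Core.hpp>
    #include <Cabana_Halo.hpp>
    #include <Kokkos_Core.hpp>
    #include <mpi.h>

    void halo_example()
    {
        // Assumed memory-space tag; the unit tests use TEST_MEMSPACE here.
        using MemSpace = Kokkos::HostSpace;

        int my_rank = -1;
        MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );

        // Two locally owned elements; ghost the second one on this same rank.
        std::size_t num_local = 2;
        Kokkos::View<std::size_t*, MemSpace> export_ids( "export_ids", 1 );
        Kokkos::View<int*, MemSpace> export_ranks( "export_ranks", 1 );
        Kokkos::deep_copy( export_ids, std::size_t(1) );
        Kokkos::deep_copy( export_ranks, my_rank );

        // Export-ranks-only constructor: the topology is discovered for us.
        Cabana::Halo<MemSpace> halo( MPI_COMM_WORLD, num_local,
                                     export_ids, export_ranks );

        // Owned elements first, then numGhost() slots appended for the ghosts.
        using Types = Cabana::MemberTypes<double[3]>;
        Cabana::AoSoA<Types, MemSpace> data( halo.numLocal() + halo.numGhost() );

        Cabana::gather( halo, data );    // push owned values out to the ghosts
        auto field = data.slice<0>();
        Cabana::scatter( halo, field );  // sum ghost contributions back to owners
    }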
+ */ + template + Halo( MPI_Comm comm, + const std::size_t num_local, + const IdViewType& element_export_ids, + const RankViewType& element_export_ranks, + const int mpi_tag = 1221 ) + : CommunicationPlan( comm ) + , _num_local( num_local ) + { + if ( element_export_ids.size() != element_export_ranks.size() ) + throw std::runtime_error("Export ids and ranks different sizes!"); + + this->createFromExportsOnly( element_export_ranks, mpi_tag ); + this->createExportSteering( element_export_ranks, element_export_ids ); + } + + /*! + \brief Get the number of elements locally owned by this rank. + + \return THe number of elements locally owned by this rank. + */ + std::size_t numLocal() const + { return _num_local; } + + /*! + \brief Get the number of ghost elements this rank. Use this to resize a + data structure for scatter/gather operations. For use with scatter + gather, a data structure should be of size numLocal() + numGhost(). + + \return The number of ghosted elements on this rank. + */ + std::size_t numGhost() const + { return this->totalNumImport(); } + + private: + + std::size_t _num_local; +}; + +//---------------------------------------------------------------------------// +// Static type checker. +template +struct is_halo : public std::false_type {}; + +template +struct is_halo > + : public std::true_type {}; + +template +struct is_halo > + : public std::true_type {}; + +//---------------------------------------------------------------------------// +/*! + \brief Synchronously gather data from the local decomposition to the ghosts + using the halo forward communication plan. AoSoA version. This is a + uniquely-owned to multiply-owned communication. + + A gather sends data from a locally owned elements to one or many ranks on + which they exist as ghosts. A locally owned element may be sent to as many + ranks as desired to be used as a ghost on those ranks. The value of the + element in the locally owned decomposition will be the value assigned to the + element in the ghosted decomposition. + + \tparam Halo_t Halo type - must be a Halo. + + \tparam AoSoA_t AoSoA type - must be an AoSoA. + + \param halo The halo to use for the gather. + + \param aosoa The AoSoA on which to perform the gather. The AoSoA should have + a size equivalent to halo.numGhost() + halo.numLocal(). The locally owned + elements are expected to appear first (i.e. in the first halo.numLocal() + elements) and the ghosted elements are expected to appear second (i.e. in + the next halo.numGhost() elements()). + + \param mpi_tag The MPI tag to use for non-blocking communication in the + gather. Note here that if multiple instances of this function are being + called at once then different tags should be used in each function call to + avoid any communication conflicts. +*/ +template +void gather( const Halo_t& halo, + AoSoA_t& aosoa, + int mpi_tag = 1002, + typename std::enable_if<(is_halo::value && + is_aosoa::value), + int>::type * = 0 ) +{ + // Check that the AoSoA is the right size. + if ( aosoa.size() != halo.numLocal() + halo.numGhost() ) + throw std::runtime_error("AoSoA is the wrong size for scatter!"); + + // Allocate a send buffer. + Kokkos::View + send_buffer( + Kokkos::ViewAllocateWithoutInitializing("halo_send_buffer"), + halo.totalNumExport() ); + + // Get the steering vector for the sends. + auto steering = halo.getExportSteering(); + + // Gather from the local data into a tuple-contiguous send buffer. 
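    // Unlike the distributor path there is no separate self-send shortcut
    // here: every export, including those destined for this rank, goes
    // through the send buffer, and steering(i) always refers to an element
    // in the locally owned range.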
+ auto gather_send_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + send_buffer( i ) = aosoa.getTuple( steering(i) ); + }; + Kokkos::RangePolicy + gather_send_buffer_policy( 0, halo.totalNumExport() ); + Kokkos::parallel_for( "Cabana::gather::gather_send_buffer", + gather_send_buffer_policy, + gather_send_buffer_func ); + Kokkos::fence(); + + // Allocate a receive buffer. + Kokkos::View + recv_buffer( + Kokkos::ViewAllocateWithoutInitializing("halo_recv_buffer"), + halo.totalNumImport() ); + + // Post non-blocking receives. + int num_n = halo.numNeighbor(); + std::vector requests( num_n ); + std::pair recv_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + recv_range.second = + recv_range.first + halo.numImport(n); + + auto recv_subview = Kokkos::subview( recv_buffer, recv_range ); + + MPI_Irecv( recv_subview.data(), + recv_subview.size() * sizeof(typename AoSoA_t::tuple_type), + MPI_BYTE, + halo.neighborRank(n), + mpi_tag, + halo.comm(), + &(requests[n]) ); + + recv_range.first = recv_range.second; + } + + // Do blocking sends. + std::pair send_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + send_range.second = + send_range.first + halo.numExport(n); + + auto send_subview = Kokkos::subview( send_buffer, send_range ); + + MPI_Send( send_subview.data(), + send_subview.size() * sizeof(typename AoSoA_t::tuple_type), + MPI_BYTE, + halo.neighborRank(n), + mpi_tag, + halo.comm() ); + + send_range.first = send_range.second; + } + + // Wait on non-blocking receives. + std::vector status( num_n ); + const int ec = + MPI_Waitall( requests.size(), requests.data(), status.data() ); + assert( MPI_SUCCESS == ec ); + + // Extract the receive buffer into the ghosted elements. + std::size_t num_local = halo.numLocal(); + auto extract_recv_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + std::size_t ghost_idx = i + num_local; + aosoa.setTuple( ghost_idx, recv_buffer(i) ); + }; + Kokkos::RangePolicy + extract_recv_buffer_policy( 0, halo.totalNumImport() ); + Kokkos::parallel_for( "Cabana::gather::extract_recv_buffer", + extract_recv_buffer_policy, + extract_recv_buffer_func ); + Kokkos::fence(); + + // Barrier before completing to ensure synchronization. + MPI_Barrier( halo.comm() ); +} + +//---------------------------------------------------------------------------// +/*! + \brief Synchronously gather data from the local decomposition to the ghosts + using the halo forward communication plan. Slice version. This is a + uniquely-owned to multiply-owned communication. + + A gather sends data from a locally owned elements to one or many ranks on + which they exist as ghosts. A locally owned element may be sent to as many + ranks as desired to be used as a ghost on those ranks. The value of the + element in the locally owned decomposition will be the value assigned to the + element in the ghosted decomposition. + + \tparam Halo_t Halo type - must be a Halo. + + \tparam Slice_t Slice type - must be a Slice. + + \param halo The halo to use for the gather. + + \param slice The Slice on which to perform the gather. The Slice should have + a size equivalent to halo.numGhost() + halo.numLocal(). The locally owned + elements are expected to appear first (i.e. in the first halo.numLocal() + elements) and the ghosted elements are expected to appear second (i.e. in + the next halo.numGhost() elements()). + + \param mpi_tag The MPI tag to use for non-blocking communication in the + gather. 
Note here that if multiple instances of this function are being + called at once then different tags should be used in each function call to + avoid any communication conflicts. +*/ +template +void gather( const Halo_t& halo, + Slice_t& slice, + int mpi_tag = 1002, + typename std::enable_if<(is_halo::value && + is_slice::value), + int>::type * = 0 ) +{ + // Check that the Slice is the right size. + if ( slice.size() != halo.numLocal() + halo.numGhost() ) + throw std::runtime_error("Slice is the wrong size for scatter!"); + + // Get the number of components in the slice. + int num_comp = 1; + for ( int d = 2; d < slice.rank(); ++d ) + num_comp *= slice.extent(d); + + // Get the raw slice data. + auto slice_data = slice.data(); + + // Allocate a send buffer. Note this one is layout right so the components + // are consecutive. + Kokkos::View + send_buffer( + Kokkos::ViewAllocateWithoutInitializing("halo_send_buffer"), + halo.totalNumExport(), num_comp ); + + // Get the steering vector for the sends. + auto steering = halo.getExportSteering(); + + // Gather from the local data into a tuple-contiguous send buffer. + auto gather_send_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + auto s = Slice_t::index_type::s( steering(i) ); + auto a = Slice_t::index_type::a( steering(i) ); + std::size_t slice_offset = s*slice.stride(0) + a; + for ( int n = 0; n < num_comp; ++n ) + send_buffer( i, n ) = + slice_data[ slice_offset + n * Slice_t::vector_length ]; + }; + Kokkos::RangePolicy + gather_send_buffer_policy( 0, halo.totalNumExport() ); + Kokkos::parallel_for( "Cabana::gather::gather_send_buffer", + gather_send_buffer_policy, + gather_send_buffer_func ); + Kokkos::fence(); + + // Allocate a receive buffer. Note this one is layout right so the components + // are consecutive. + Kokkos::View + recv_buffer( + Kokkos::ViewAllocateWithoutInitializing("halo_recv_buffer"), + halo.totalNumImport(), num_comp ); + + // Post non-blocking receives. + int num_n = halo.numNeighbor(); + std::vector requests( num_n ); + std::pair recv_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + recv_range.second = recv_range.first + halo.numImport(n); + + auto recv_subview = + Kokkos::subview( recv_buffer, recv_range, Kokkos::ALL ); + + MPI_Irecv( recv_subview.data(), + recv_subview.size() * sizeof(typename Slice_t::value_type), + MPI_BYTE, + halo.neighborRank(n), + mpi_tag, + halo.comm(), + &(requests[n]) ); + + recv_range.first = recv_range.second; + } + + // Do blocking sends. + std::pair send_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + send_range.second = send_range.first + halo.numExport(n); + + auto send_subview = + Kokkos::subview( send_buffer, send_range, Kokkos::ALL ); + + MPI_Send( send_subview.data(), + send_subview.size() * sizeof(typename Slice_t::value_type), + MPI_BYTE, + halo.neighborRank(n), + mpi_tag, + halo.comm() ); + + send_range.first = send_range.second; + } + + // Wait on non-blocking receives. + std::vector status( num_n ); + const int ec = + MPI_Waitall( requests.size(), requests.data(), status.data() ); + assert( MPI_SUCCESS == ec ); + + // Extract the receive buffer into the ghosted elements. 
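    // Received element i is written into ghost slot num_local + i; the
    // per-component offsets mirror the packing logic above, stepping by
    // vector_length between components of the same element.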
+ std::size_t num_local = halo.numLocal(); + auto extract_recv_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + std::size_t ghost_idx = i + num_local; + auto s = Slice_t::index_type::s( ghost_idx ); + auto a = Slice_t::index_type::a( ghost_idx ); + std::size_t slice_offset = s*slice.stride(0) + a; + for ( int n = 0; n < num_comp; ++n ) + slice_data[ slice_offset + Slice_t::vector_length * n ] = + recv_buffer( i, n ); + }; + Kokkos::RangePolicy + extract_recv_buffer_policy( 0, halo.totalNumImport() ); + Kokkos::parallel_for( "Cabana::gather::extract_recv_buffer", + extract_recv_buffer_policy, + extract_recv_buffer_func ); + Kokkos::fence(); + + // Barrier before completing to ensure synchronization. + MPI_Barrier( halo.comm() ); +} + +//---------------------------------------------------------------------------// +/*! + \brief Synchronously scatter data from the ghosts to the local decomposition + of a slice using the halo reverse communication plan. This is a + multiply-owned to uniquely owned communication. + + In a scatter operation results from ghosted values on other processors are + scattered back to the owning processor of the ghost and the value associated + with the ghost is summed into the locally owned value the ghost + represents. If a locally owned element is ghosted on multiple ranks, then + multiple contributions will be made to the sum, one for each rank. + + \tparam Halo_t Halo type - must be a Halo. + + \tparam Slice_t Slice type - must be a Slice. + + \param halo The halo to use for the scatter. + + \param slice The Slice on which to perform the scatter. The Slice should have + a size equivalent to halo.numGhost() + halo.numLocal(). The locally owned + elements are expected to appear first (i.e. in the first halo.numLocal() + elements) and the ghosted elements are expected to appear second (i.e. in + the next halo.numGhost() elements()). + + \param mpi_tag The MPI tag to use for non-blocking communication in the + scatter. Note here that if multiple instances of this function are being + called at once then different tags should be used in each function call to + avoid any communication conflicts. +*/ +template +void scatter( const Halo_t& halo, + Slice_t& slice, + int mpi_tag = 1003, + typename std::enable_if<(is_halo::value && + is_slice::value), + int>::type * = 0 ) +{ + // Check that the Slice is the right size. + if ( slice.size() != halo.numLocal() + halo.numGhost() ) + throw std::runtime_error("Slice is the wrong size for scatter!"); + + // Get the number of components in the slice. + int num_comp = 1; + for ( int d = 2; d < slice.rank(); ++d ) + num_comp *= slice.extent(d); + + // Get the raw slice data. + auto slice_data = slice.data(); + + // Allocate a send buffer. Note this one is layout right so the components + // are consecutive. + Kokkos::View + send_buffer( + Kokkos::ViewAllocateWithoutInitializing("halo_send_buffer"), + halo.totalNumImport(), num_comp ); + + // Extract the send buffer from the ghosted elements. 
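    // Note the role reversal relative to gather(): the scatter send buffer is
    // filled from the ghost slots (totalNumImport entries) and the receive
    // buffer is sized by totalNumExport, i.e. the communication plan runs in
    // reverse. The received contributions are later summed into the owned
    // elements with atomic adds so that multiple ghosts of the same element
    // combine correctly.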
+ std::size_t num_local = halo.numLocal(); + auto extract_send_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + std::size_t ghost_idx = i + num_local; + auto s = Slice_t::index_type::s( ghost_idx ); + auto a = Slice_t::index_type::a( ghost_idx ); + std::size_t slice_offset = s*slice.stride(0) + a; + for ( int n = 0; n < num_comp; ++n ) + send_buffer( i, n ) = + slice_data[ slice_offset + Slice_t::vector_length * n ]; + }; + Kokkos::RangePolicy + extract_send_buffer_policy( 0, halo.totalNumImport() ); + Kokkos::parallel_for( "Cabana::scatter::extract_send_buffer", + extract_send_buffer_policy, + extract_send_buffer_func ); + Kokkos::fence(); + + // Allocate a receive buffer. Note this one is layout right so the components + // are consecutive. + Kokkos::View + recv_buffer( + Kokkos::ViewAllocateWithoutInitializing("halo_recv_buffer"), + halo.totalNumExport(), num_comp ); + + // Post non-blocking receives. + int num_n = halo.numNeighbor(); + std::vector requests( num_n ); + std::pair recv_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + recv_range.second = recv_range.first + halo.numExport(n); + + auto recv_subview = + Kokkos::subview( recv_buffer, recv_range, Kokkos::ALL ); + + MPI_Irecv( recv_subview.data(), + recv_subview.size() * sizeof(typename Slice_t::value_type), + MPI_BYTE, + halo.neighborRank(n), + mpi_tag, + halo.comm(), + &(requests[n]) ); + + recv_range.first = recv_range.second; + } + + // Do blocking sends. + std::pair send_range = { 0, 0 }; + for ( int n = 0; n < num_n; ++n ) + { + send_range.second = send_range.first + halo.numImport(n); + + auto send_subview = + Kokkos::subview( send_buffer, send_range, Kokkos::ALL ); + + MPI_Send( send_subview.data(), + send_subview.size() * sizeof(typename Slice_t::value_type), + MPI_BYTE, + halo.neighborRank(n), + mpi_tag, + halo.comm() ); + + send_range.first = send_range.second; + } + + // Wait on non-blocking receives. + std::vector status( num_n ); + const int ec = + MPI_Waitall( requests.size(), requests.data(), status.data() ); + assert( MPI_SUCCESS == ec ); + + // Get the steering vector for the sends. + auto steering = halo.getExportSteering(); + + // Scatter the ghosts in the receive buffer into the local values. + auto scatter_recv_buffer_func = + KOKKOS_LAMBDA( const std::size_t i ) + { + auto s = Slice_t::index_type::s( steering(i) ); + auto a = Slice_t::index_type::a( steering(i) ); + std::size_t slice_offset = s*slice.stride(0) + a; + for ( int n = 0; n < num_comp; ++n ) + Kokkos::atomic_add( + slice_data + slice_offset + Slice_t::vector_length * n, + recv_buffer(i,n) ); + }; + Kokkos::RangePolicy + scatter_recv_buffer_policy( 0, halo.totalNumExport() ); + Kokkos::parallel_for( "Cabana::scatter::scatter_recv_buffer", + scatter_recv_buffer_policy, + scatter_recv_buffer_func ); + Kokkos::fence(); + + // Barrier before completing to ensure synchronization. + MPI_Barrier( halo.comm() ); +} + +//---------------------------------------------------------------------------// + +} // end namespace Cabana + +#endif // end CABANA_HALO_HPP diff --git a/core/unit_test/Cuda/tstHalo_Cuda.cpp b/core/unit_test/Cuda/tstHalo_Cuda.cpp new file mode 100644 index 000000000..d8756f956 --- /dev/null +++ b/core/unit_test/Cuda/tstHalo_Cuda.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. 
For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/OpenMP/tstHalo_OpenMP.cpp b/core/unit_test/OpenMP/tstHalo_OpenMP.cpp new file mode 100644 index 000000000..d2588dd12 --- /dev/null +++ b/core/unit_test/OpenMP/tstHalo_OpenMP.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/Pthread/tstHalo_Pthread.cpp b/core/unit_test/Pthread/tstHalo_Pthread.cpp new file mode 100644 index 000000000..bd5db3ecb --- /dev/null +++ b/core/unit_test/Pthread/tstHalo_Pthread.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/Serial/tstHalo_Serial.cpp b/core/unit_test/Serial/tstHalo_Serial.cpp new file mode 100644 index 000000000..b0b1f1a45 --- /dev/null +++ b/core/unit_test/Serial/tstHalo_Serial.cpp @@ -0,0 +1,13 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include diff --git a/core/unit_test/mpi_unit_test_main.cpp b/core/unit_test/mpi_unit_test_main.cpp new file mode 100644 index 000000000..9f548cb21 --- /dev/null +++ b/core/unit_test/mpi_unit_test_main.cpp @@ -0,0 +1,27 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. 
* + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include + +#include + +#include + +int main( int argc, char* argv[] ) +{ + MPI_Init( &argc, &argv ); + Kokkos::initialize( argc, argv ); + ::testing::InitGoogleTest( &argc, argv ); + int return_val = RUN_ALL_TESTS(); + Kokkos::finalize(); + MPI_Finalize(); + return return_val; +} diff --git a/core/unit_test/tstHalo.hpp b/core/unit_test/tstHalo.hpp new file mode 100644 index 000000000..36b5c3972 --- /dev/null +++ b/core/unit_test/tstHalo.hpp @@ -0,0 +1,424 @@ +/**************************************************************************** + * Copyright (c) 2018 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include + +namespace Test +{ + +//---------------------------------------------------------------------------// +// test without collisions +void test1( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > halo; + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get my size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Every rank will send ghosts to all other ranks. Send one element to + // each rank including yourself. Interleave the sends. The resulting + // communication plan has ghosts that have one unique destination. + int num_local = 2 * my_size; + Kokkos::View export_ranks_host( + "export_ranks", my_size ); + Kokkos::View export_ids_host( + "export_ids", my_size ); + std::vector neighbors( my_size ); + for ( int n = 0; n < my_size; ++n ) + { + neighbors[n] = n; + export_ranks_host(n) = n; + export_ids_host(n) = 2*n + 1; + } + auto export_ranks = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ranks_host ); + auto export_ids = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ids_host ); + + // Create the plan. + if ( use_topology ) + halo = std::make_shared >( + MPI_COMM_WORLD, num_local, export_ids, export_ranks, neighbors ); + else + halo = std::make_shared >( + MPI_COMM_WORLD, num_local, export_ids, export_ranks ); + + // Check the plan. + EXPECT_EQ( halo->numLocal(), num_local ); + EXPECT_EQ( halo->numGhost(), my_size ); + + // Create an AoSoA of local data with space allocated for local data. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data( halo->numLocal() + halo->numGhost() ); + auto slice_int = data.slice<0>(); + auto slice_dbl = data.slice<1>(); + + // Fill the local data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int(i) = my_rank + 1; + slice_dbl(i,0) = my_rank + 1; + slice_dbl(i,1) = my_rank + 1.5; + }; + Kokkos::RangePolicy + range_policy( 0, num_local ); + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Gather by AoSoA. + Cabana::gather( *halo, data ); + + // Check the results of the gather. 
+ Cabana::AoSoA data_host( + halo->numLocal() + halo->numGhost() ); + auto slice_int_host = data_host.slice<0>(); + auto slice_dbl_host = data_host.slice<1>(); + Cabana::deep_copy( data_host, data ); + + // check that the local data didn't change. + for ( int i = 0; i < my_size; ++i ) + { + EXPECT_EQ( slice_int_host(2*i), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(2*i,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(2*i,1), my_rank + 1.5 ); + + EXPECT_EQ( slice_int_host(2*i + 1), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(2*i + 1,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(2*i + 1,1), my_rank + 1.5 ); + } + + // Check that we got one element from everyone. + for ( int i = num_local; i < num_local + my_size; ++i ) + { + // Self sends are first. + int send_rank = i - num_local; + if ( send_rank == 0 ) + { + EXPECT_EQ( slice_int_host(i), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,1), my_rank + 1.5 ); + } + else if ( send_rank == my_rank ) + { + EXPECT_EQ( slice_int_host(i), 1 ); + EXPECT_EQ( slice_dbl_host(i,0), 1 ); + EXPECT_EQ( slice_dbl_host(i,1), 1.5 ); + } + else + { + EXPECT_EQ( slice_int_host(i), send_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,0), send_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,1), send_rank + 1.5 ); + } + } + + // Scatter back the results, + Cabana::scatter( *halo, slice_int ); + Cabana::scatter( *halo, slice_dbl ); + Cabana::deep_copy( data_host, data ); + + // Check that the local data was updated. Every ghost had a unique + // destination so the result should be doubled for those elements that + // were ghosted. + for ( int i = 0; i < my_size; ++i ) + { + EXPECT_EQ( slice_int_host(2*i), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(2*i,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(2*i,1), my_rank + 1.5 ); + + EXPECT_EQ( slice_int_host(2*i + 1), 2 * (my_rank + 1) ); + EXPECT_EQ( slice_dbl_host(2*i + 1,0), 2 * (my_rank + 1 ) ); + EXPECT_EQ( slice_dbl_host(2*i + 1,1), 2 * (my_rank + 1.5) ); + } + + // Check that the ghost data didn't change. + for ( int i = num_local; i < num_local + my_size; ++i ) + { + // Self sends are first. + int send_rank = i - num_local; + if ( send_rank == 0 ) + { + EXPECT_EQ( slice_int_host(i), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,1), my_rank + 1.5 ); + } + else if ( send_rank == my_rank ) + { + EXPECT_EQ( slice_int_host(i), 1 ); + EXPECT_EQ( slice_dbl_host(i,0), 1 ); + EXPECT_EQ( slice_dbl_host(i,1), 1.5 ); + } + else + { + EXPECT_EQ( slice_int_host(i), send_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,0), send_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,1), send_rank + 1.5 ); + } + } + + // Gather again, this time with slices. + Cabana::gather( *halo, slice_int ); + Cabana::gather( *halo, slice_dbl ); + Cabana::deep_copy( data_host, data ); + + // Check that the local data remained unchanged. + for ( int i = 0; i < my_size; ++i ) + { + EXPECT_EQ( slice_int_host(2*i), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(2*i,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(2*i,1), my_rank + 1.5 ); + + EXPECT_EQ( slice_int_host(2*i + 1), 2 * (my_rank + 1) ); + EXPECT_EQ( slice_dbl_host(2*i + 1,0), 2 * (my_rank + 1 ) ); + EXPECT_EQ( slice_dbl_host(2*i + 1,1), 2 * (my_rank + 1.5) ); + } + + // Check that the ghost data was updated. + for ( int i = num_local; i < num_local + my_size; ++i ) + { + // Self sends are first. 
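The doubling expected by the scatter checks above comes down to one line of arithmetic: each exported element has exactly one ghost copy, and scatter atomically adds that unmodified copy back onto its owner. A scalar model of the round trip (the rank value is only an example):

    // Scalar model of test1's gather -> scatter round trip for one exported
    // element: one ghost copy exists, so the owned value doubles.
    #include <cassert>

    int main()
    {
        const int my_rank = 3;        // example rank (assumed for illustration)
        double owned = my_rank + 1.0; // odd-indexed local value set by fill_func
        const double ghost = owned;   // gather copies the owned value to its ghost
        owned += ghost;               // scatter adds the unmodified ghost back once
        assert( owned == 2.0 * ( my_rank + 1.0 ) );
        return 0;
    }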
+ int send_rank = i - num_local; + if ( send_rank == 0 ) + { + EXPECT_EQ( slice_int_host(i), 2 * (my_rank + 1) ); + EXPECT_EQ( slice_dbl_host(i,0), 2 * (my_rank + 1) ); + EXPECT_EQ( slice_dbl_host(i,1), 2 * (my_rank + 1.5) ); + } + else if ( send_rank == my_rank ) + { + EXPECT_EQ( slice_int_host(i), 2 ); + EXPECT_EQ( slice_dbl_host(i,0), 2 ); + EXPECT_EQ( slice_dbl_host(i,1), 3 ); + } + else + { + EXPECT_EQ( slice_int_host(i), 2 * (send_rank + 1) ); + EXPECT_EQ( slice_dbl_host(i,0), 2 * (send_rank + 1) ); + EXPECT_EQ( slice_dbl_host(i,1), 2 * (send_rank + 1.5) ); + } + } +} + +//---------------------------------------------------------------------------// +// test with collisions +void test2( const bool use_topology ) +{ + // Make a communication plan. + std::shared_ptr > halo; + + // Get my rank. + int my_rank = -1; + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + + // Get my size. + int my_size = -1; + MPI_Comm_size( MPI_COMM_WORLD, &my_size ); + + // Every rank will send its single data point as ghosts to all other + // ranks. This will create collisions in the scatter as every rank will + // have data for this rank in the summation. + int num_local = 1; + Kokkos::View export_ranks_host( + "export_ranks", my_size ); + Kokkos::View + export_ids( "export_ids", my_size ); + Kokkos::deep_copy( export_ids, 0 ); + std::vector neighbors( my_size ); + for ( int n = 0; n < my_size; ++n ) + { + neighbors[n] = n; + export_ranks_host(n) = n; + } + auto export_ranks = Kokkos::create_mirror_view_and_copy( + TEST_MEMSPACE(), export_ranks_host ); + + // Create the plan. + if ( use_topology ) + halo = std::make_shared >( + MPI_COMM_WORLD, num_local, export_ids, export_ranks, neighbors ); + else + halo = std::make_shared >( + MPI_COMM_WORLD, num_local, export_ids, export_ranks ); + + // Check the plan. + EXPECT_EQ( halo->numLocal(), num_local ); + EXPECT_EQ( halo->numGhost(), my_size ); + + // Create an AoSoA of local data with space allocated for local data. + using DataTypes = Cabana::MemberTypes; + using AoSoA_t = Cabana::AoSoA; + AoSoA_t data( halo->numLocal() + halo->numGhost() ); + auto slice_int = data.slice<0>(); + auto slice_dbl = data.slice<1>(); + + // Fill the local data. + auto fill_func = + KOKKOS_LAMBDA( const int i ) + { + slice_int(i) = my_rank + 1; + slice_dbl(i,0) = my_rank + 1; + slice_dbl(i,1) = my_rank + 1.5; + }; + Kokkos::RangePolicy + range_policy( 0, num_local ); + Kokkos::parallel_for( range_policy, fill_func ); + Kokkos::fence(); + + // Gather by AoSoA. + Cabana::gather( *halo, data ); + + // Check the results of the gather. + Cabana::AoSoA data_host( + halo->numLocal() + halo->numGhost() ); + auto slice_int_host = data_host.slice<0>(); + auto slice_dbl_host = data_host.slice<1>(); + Cabana::deep_copy( data_host, data ); + + // check that the local data didn't change. + EXPECT_EQ( slice_int_host(0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(0,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(0,1), my_rank + 1.5 ); + + // Check that we got one element from everyone. + for ( int i = num_local; i < num_local + my_size; ++i ) + { + // Self sends are first. 
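The scatter expectations checked further below follow from the collision setup above: the single owned element is ghosted on every rank, so my_size contributions are added back and the owned value grows by a factor of my_size + 1. A scalar model with placeholder rank and size values:

    // Scalar model of test2's scatter with collisions: my_size ghost copies of
    // the one owned element are atomically added back onto the owner.
    #include <cassert>

    int main()
    {
        const int my_rank = 2;        // example rank (assumed)
        const int my_size = 4;        // example communicator size (assumed)
        double owned = my_rank + 1.0; // the single owned value set by fill_func
        const double ghost = owned;   // each rank holds one ghost copy after gather
        for ( int r = 0; r < my_size; ++r )
            owned += ghost;           // one atomic_add contribution arrives per rank
        assert( owned == ( my_size + 1 ) * ( my_rank + 1.0 ) );
        return 0;
    }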
+ int send_rank = i - num_local; + if ( send_rank == 0 ) + { + EXPECT_EQ( slice_int_host(i), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,1), my_rank + 1.5 ); + } + else if ( send_rank == my_rank ) + { + EXPECT_EQ( slice_int_host(i), 1 ); + EXPECT_EQ( slice_dbl_host(i,0), 1 ); + EXPECT_EQ( slice_dbl_host(i,1), 1.5 ); + } + else + { + EXPECT_EQ( slice_int_host(i), send_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,0), send_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,1), send_rank + 1.5 ); + } + } + + // Scatter back the results, + Cabana::scatter( *halo, slice_int ); + Cabana::scatter( *halo, slice_dbl ); + Cabana::deep_copy( data_host, data ); + + // Check that the local data was updated. Every ghost was sent to all of + // the ranks so the result should be multiplied by the number of ranks. + EXPECT_EQ( slice_int_host(0), (my_size + 1) * (my_rank + 1) ); + EXPECT_EQ( slice_dbl_host(0,0), (my_size + 1) * (my_rank + 1 ) ); + EXPECT_EQ( slice_dbl_host(0,1), (my_size + 1) * (my_rank + 1.5) ); + + // Check that the ghost data didn't change. + for ( int i = num_local; i < num_local + my_size; ++i ) + { + // Self sends are first. + int send_rank = i - num_local; + if ( send_rank == 0 ) + { + EXPECT_EQ( slice_int_host(i), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,0), my_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,1), my_rank + 1.5 ); + } + else if ( send_rank == my_rank ) + { + EXPECT_EQ( slice_int_host(i), 1 ); + EXPECT_EQ( slice_dbl_host(i,0), 1 ); + EXPECT_EQ( slice_dbl_host(i,1), 1.5 ); + } + else + { + EXPECT_EQ( slice_int_host(i), send_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,0), send_rank + 1 ); + EXPECT_EQ( slice_dbl_host(i,1), send_rank + 1.5 ); + } + } + + // Gather again, this time with slices. + Cabana::gather( *halo, slice_int ); + Cabana::gather( *halo, slice_dbl ); + Cabana::deep_copy( data_host, data ); + + // Check that the local data remained unchanged. + EXPECT_EQ( slice_int_host(0), (my_size + 1) * (my_rank + 1) ); + EXPECT_EQ( slice_dbl_host(0,0), (my_size + 1) * (my_rank + 1 ) ); + EXPECT_EQ( slice_dbl_host(0,1), (my_size + 1) * (my_rank + 1.5) ); + + // Check that the ghost data was updated. + for ( int i = num_local; i < num_local + my_size; ++i ) + { + // Self sends are first. + int send_rank = i - num_local; + if ( send_rank == 0 ) + { + EXPECT_EQ( slice_int_host(i), (my_size + 1) * (my_rank + 1) ); + EXPECT_EQ( slice_dbl_host(i,0), (my_size + 1) * (my_rank + 1) ); + EXPECT_EQ( slice_dbl_host(i,1), (my_size + 1) * (my_rank + 1.5) ); + } + else if ( send_rank == my_rank ) + { + EXPECT_EQ( slice_int_host(i), (my_size + 1) ); + EXPECT_EQ( slice_dbl_host(i,0), (my_size + 1) ); + EXPECT_EQ( slice_dbl_host(i,1), (my_size + 1) * 1.5 ); + } + else + { + EXPECT_EQ( slice_int_host(i), (my_size + 1) * (send_rank + 1) ); + EXPECT_EQ( slice_dbl_host(i,0), (my_size + 1) * (send_rank + 1) ); + EXPECT_EQ( slice_dbl_host(i,1), (my_size + 1) * (send_rank + 1.5) ); + } + } +} + +//---------------------------------------------------------------------------// +// RUN TESTS +//---------------------------------------------------------------------------// +TEST_F( TEST_CATEGORY, halo_test_1 ) +{ test1(true); } + +TEST_F( TEST_CATEGORY, halo_test_1_no_topo ) +{ test1(false); } + +TEST_F( TEST_CATEGORY, halo_test_2 ) +{ test2(true); } + +TEST_F( TEST_CATEGORY, halo_test_2_no_topo ) +{ test2(false); } + +//---------------------------------------------------------------------------// + +} // end namespace Test
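For context on the TEST_F registrations above: TEST_CATEGORY must name a gtest fixture, and TEST_MEMSPACE / TEST_EXECSPACE the backend's Kokkos spaces. The per-backend category headers are not part of this excerpt, so the following is only a hypothetical sketch of what one (a serial build, say) could define; every name in it is an assumption:

    // Hypothetical per-backend category header (all names assumed) that would
    // let the TEST_F( TEST_CATEGORY, ... ) registrations above compile.
    #include <Kokkos_Core.hpp>
    #include <gtest/gtest.h>

    class serial : public ::testing::Test {}; // gtest fixture named per backend

    #define TEST_CATEGORY serial
    #define TEST_EXECSPACE Kokkos::Serial
    #define TEST_MEMSPACE Kokkos::HostSpace

With such a header in place, the shared tests are driven by mpi_unit_test_main.cpp, so they are intended to be launched under an MPI runner (e.g. mpirun with several ranks); the exact executable names depend on the build system targets and are not shown in this patch.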