From d0019e5d32e99e7d679b3b773cf16b6f8e7da6f9 Mon Sep 17 00:00:00 2001
From: marsaev <32493426+marsaev@users.noreply.github.com>
Date: Wed, 8 Apr 2020 00:11:09 +0300
Subject: [PATCH] Revert "Removing references to Fermi, code specific to Fermi, and adding duplicates for Volta ready for tuning."

This reverts commit cb986b36dfdd1f23f01ddb7ee793ce2dc205c52d.
---
 base/include/csr_multiply_sm35.h              |    2 +-
 base/include/csr_multiply_sm70.h              |   82 -
 base/include/hash_containers_sm70.inl         | 1430 -----------
 base/include/ld_functions.h                   |    8 +
 base/include/memory_intrinsics.h              |   32 +
 base/include/sm_utils.inl                     |  168 ++
 base/include/strided_reduction.h              |   37 +-
 base/include/texture.h                        |    4 +
 base/src/classical/selectors/selector.cu      |   88 +-
 base/src/csr_multiply.cu                      |   24 +-
 base/src/csr_multiply_sm35.cu                 |   29 +-
 base/src/csr_multiply_sm70.cu                 | 2124 -----------------
 base/src/multiply.cu                          |   52 +-
 core/include/matrix_coloring/bfs.h            |   29 +
 core/include/matrix_coloring/coloring_utils.h |   33 +
 .../low_deg_coarse_A_generator.cu             |  267 ++-
 .../selectors/parallel_greedy_selector.cu     |    6 +
 core/src/classical/interpolators/distance2.cu |  281 ++-
 core/src/classical/interpolators/multipass.cu |  224 +-
 core/src/classical/strength/affinity.cu       |   36 +
 core/src/matrix_coloring/coloring_utils.cu    |    4 +
 .../matrix_coloring/greedy_min_max_2ring.cu   |   59 +
 core/src/matrix_coloring/min_max.cu           |   20 +
 core/src/matrix_coloring/min_max_2ring.cu     |    3 +
 core/src/scalers/nbinormalization.cu          |   57 +
 core/src/solvers/dense_lu_solver.cu           |   70 +-
 core/src/solvers/kaczmarz_solver.cu           |   37 +
 core/src/solvers/multicolor_dilu_solver.cu    |  357 ++-
 core/src/solvers/multicolor_ilu_solver.cu     |  127 +-
 examples/amgx_spmv_example/Makefile           |    2 +-
 30 files changed, 1885 insertions(+), 3807 deletions(-)
 delete mode 100644 base/include/csr_multiply_sm70.h
 delete mode 100644 base/include/hash_containers_sm70.inl
 delete mode 100644 base/src/csr_multiply_sm70.cu

diff --git a/base/include/csr_multiply_sm35.h b/base/include/csr_multiply_sm35.h
index 985b3ee0..40d85c8c 100644
--- a/base/include/csr_multiply_sm35.h
+++ b/base/include/csr_multiply_sm35.h
@@ -31,7 +31,7 @@ namespace amgx
 {
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// SM35 implementation of the CSR_Multiply routines.
+// SM20 implementation of the CSR_Multiply routines.
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 template< typename T_Config >
diff --git a/base/include/csr_multiply_sm70.h b/base/include/csr_multiply_sm70.h
deleted file mode 100644
index bb447ee2..00000000
--- a/base/include/csr_multiply_sm70.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2013-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- *   contributors may be used to endorse or promote products derived
- *   from this software without specific prior written permission.
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -//#pragma once - -namespace amgx -{ - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SM70 implementation of the CSR_Multiply routines. -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< typename T_Config > -class CSR_Multiply_Sm70 -{}; - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -class CSR_Multiply_Sm70 > : public CSR_Multiply_Impl > -{ - typedef CSR_Multiply_Impl > Base; - public: - typedef typename Base::TConfig_d TConfig_d; - typedef typename Base::Matrix_d Matrix_d; - typedef typename Matrix_d::IVector IVector_d; - typedef typename Base::Value_type Value_type; - typedef typename Matrix_d::IVector IVector; - typedef typename Matrix_d::MVector MVector; - - public: - // Create a workspace to run the product. - CSR_Multiply_Sm70( bool allocate_values = true, int grid_size = 128, int max_warp_count = 8, int gmem_size = 2048 ); - - protected: - // Count the number of non-zero elements. The callee is responsible for setting the work queue value. - void count_non_zeroes( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ); - // Compute the sparsity of RAP_int + RAP_ext - void count_non_zeroes_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids); - - // Compute the ILU1 sparsity of A. - void count_non_zeroes_ilu1( const Matrix_d &A, Matrix_d &B ); - // Compute offsets. - void compute_offsets( Matrix_d &C ); - // Compute the sparsity of the product AxB. - void compute_sparsity( const Matrix_d &A, const Matrix_d &B, Matrix_d &C ); - // Compute the ILU1 sparsity of A. - void compute_sparsity_ilu1( const Matrix_d &A, Matrix_d &B ); - // Compute values. - void compute_values( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, int num_threads, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ); - - void compute_values_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids, int num_threads); - - -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace amgx - diff --git a/base/include/hash_containers_sm70.inl b/base/include/hash_containers_sm70.inl deleted file mode 100644 index 96333ab2..00000000 --- a/base/include/hash_containers_sm70.inl +++ /dev/null @@ -1,1430 +0,0 @@ -/* Copyright (c) 2011-2020, NVIDIA CORPORATION. 
All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -static __constant__ unsigned c_hash_keys[] = -{ - 3499211612, 581869302, 3890346734, 3586334585, - 545404204, 4161255391, 3922919429, 949333985, - 2715962298, 1323567403, 418932835, 2350294565, - 1196140740, 809094426, 2348838239, 4264392720 -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< typename Key_type, int SMEM_SIZE = 128, int NUM_HASH_FCTS = 4, int WARP_SIZE = 32 > -class Hash_set -{ - // Associated index. - typedef Hash_index Index; - - protected: - // The size of the table (occupancy). - int m_smem_count, m_gmem_count; - // The keys stored in the hash table. - volatile Key_type *m_smem_keys, *m_gmem_keys; - // The size of the global memory buffer. - const int m_gmem_size; - // Is it ok? - bool m_fail; - - public: - // Constructor. - __device__ __forceinline__ Hash_set( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, int gmem_size ) : - m_smem_count(0), - m_gmem_count(1), - m_smem_keys (smem_keys), - m_gmem_keys (gmem_keys), - m_gmem_size (gmem_size), - m_fail (false) - - {} - - // Clear the table. - __device__ __forceinline__ void clear( bool skip_gmem = false ); - // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). - __device__ __forceinline__ int compute_size(); - // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). - __device__ __forceinline__ int compute_size_with_duplicates(); - // Does the set contain those values? - __device__ __forceinline__ bool contains( Key_type key ) const; - // Find an index. - __device__ __forceinline__ int find_index( Key_type key, const Index &index, bool print_debug ) const; - // Has the process failed. 
- __device__ __forceinline__ bool has_failed() const { return m_fail; } - // Insert a key inside the set. If status is NULL, ignore failure. - __device__ __forceinline__ void insert( Key_type key, int *status ); - // Load a set. - __device__ __forceinline__ void load( int count, const Key_type *keys, const int *pos ); - // Load a set and use it as an index. - __device__ __forceinline__ void load_index( int count, const Key_type *keys, const int *pos, Index &index, bool print_debug ); - // Store a set. - __device__ __forceinline__ void store( int count, Key_type *keys ); - // Store a set. - __device__ __forceinline__ int store_with_positions( Key_type *keys, int *pos ); - // Store a set. - __device__ __forceinline__ int store( Key_type *keys ); -}; - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -void Hash_set::clear( bool skip_gmem ) -{ - int lane_id = utils::lane_id(); - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - - m_smem_count = 0; - - if ( skip_gmem || m_gmem_count == 0 ) - { - m_gmem_count = 0; - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - m_gmem_keys[offset] = -1; - } - - m_gmem_count = 0; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -int Hash_set::compute_size() -{ - m_smem_count += m_gmem_count; -#pragma unroll - - for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) - { - m_smem_count += utils::shfl_xor( m_smem_count, offset ); - } - - m_gmem_count = utils::any( m_gmem_count > 0 ); - return m_smem_count; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -int Hash_set::compute_size_with_duplicates() -{ - int lane_id = utils::lane_id(); - // Count the number of keys in SMEM. - int sum = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - sum += __popc( utils::ballot( key != -1 ) ); - } - - // Is there any key in GMEM. If not, just quit. - m_gmem_count = utils::any(m_gmem_count > 0); - - if ( !m_gmem_count ) - { - return sum; - } - - // Count the number of keys in GMEM. 
-#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - sum += __popc( utils::ballot( key != -1, utils::activemask() ) ); - } - - return sum; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -bool Hash_set::contains( Key_type key ) const -{ - bool done = key == -1, found = false; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return found; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - found = true; - } - - if ( found || stored_key == -1 ) - { - done = true; - } - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return found; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - found = true; - } - - if ( found || stored_key == -1 ) - { - done = true; - } - } - } - - return found; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -int Hash_set::find_index( Key_type key, const Index &index, bool print_debug ) const -{ - int idx = -1; - bool done = key == -1; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return idx; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - int result = index.find_smem(hash); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - idx = result; - done = true; - } - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return idx; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - idx = index.find_gmem(hash); - done = true; - } - } - } - - return idx; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_set::insert( Key_type key, int *status ) -{ - bool done = key == -1; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return; - } - - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = 
m_smem_keys[hash]; - - if ( stored_key == key ) - { - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) // More than one candidate may have written to that slot. - { - m_smem_count++; - done = true; - } - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return; - } - - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_gmem_keys[hash] = key; - } - - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. - { - m_gmem_count++; - done = true; - } - } - } - - if ( utils::all(done) ) - { - return; - } - - assert( status != NULL ); - - if ( utils::lane_id() == 0 ) - { - *status = 1; - } - - m_fail = true; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_set::load( int count, const Key_type *keys, const int *pos ) -{ - int lane_id = utils::lane_id(); -#pragma unroll 4 - - for ( int offset = lane_id ; offset < count ; offset += WARP_SIZE ) - { - Key_type key = keys[offset]; - int idx = pos [offset]; - // Where to store the item. - volatile Key_type *ptr = m_smem_keys; - - if ( idx >= SMEM_SIZE ) - { - ptr = m_gmem_keys; - m_gmem_count = 1; - idx -= SMEM_SIZE; - } - - // Store the item. - ptr[idx] = key; - } - - m_gmem_count = utils::any( m_gmem_count ); -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_set::load_index( int count, const Key_type *keys, const int *pos, Index &index, bool print_debug ) -{ -#pragma unroll 4 - - for ( int offset = utils::lane_id() ; offset < count ; offset += WARP_SIZE ) - { - Key_type key = keys[offset]; - int idx = pos [offset]; - // Store the item. - volatile Key_type *ptr = m_smem_keys; - - if ( idx >= SMEM_SIZE ) - { - ptr = m_gmem_keys; - m_gmem_count = 1; - idx -= SMEM_SIZE; - index.set_gmem_index( idx, offset ); - } - - // Store the item. - ptr[idx] = key; - } - - // Build the local index. 
- index.build_smem_index( m_smem_keys ); - m_gmem_count = utils::any( m_gmem_count ); -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_set::store( int count, Key_type *keys ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - } - - warp_offset += __popc( poll ); - } - - m_gmem_count = utils::any( m_gmem_count > 0 ); - - if ( !m_gmem_count ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - } - - warp_offset += __popc( poll ); - } -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -int Hash_set::store_with_positions( Key_type *keys, int *pos ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - pos [dst_offset] = offset; - } - - warp_offset += __popc( poll ); - } - - m_gmem_count = utils::any( m_gmem_count > 0 ); - - if ( !m_gmem_count ) - { - return warp_offset; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - pos [dst_offset] = SMEM_SIZE + offset; - } - - warp_offset += __popc( poll ); - } - - return warp_offset; -} - - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -int Hash_set::store( Key_type *keys ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - 
keys[dst_offset] = key; - } - - warp_offset += __popc( poll ); - } - - m_gmem_count = utils::any( m_gmem_count > 0 ); - - if ( !m_gmem_count ) - { - return warp_offset; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - } - - warp_offset += __popc( poll ); - } - - return warp_offset; -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -union Word { char b8[4]; int b32; }; - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE = 128, int NUM_HASH_FCTS = 4, int WARP_SIZE = 32 > -class Hash_map -{ - protected: - // The keys stored in the map. - volatile Key_type *m_smem_keys, *m_gmem_keys; - // Vote buffer for values. - volatile Word *m_smem_vote; - // Registers to store values. - T m_regs_vals[4]; - // The values stored in the map. - T *m_gmem_vals; - // The size of the global memory buffer. - const int m_gmem_size; - // Is there any value in GMEM. - bool m_any_gmem; - - public: - // Constructor. - __device__ __forceinline__ - Hash_map( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, volatile Word *smem_vote, T *gmem_vals, int gmem_size ) : - m_smem_keys(smem_keys), - m_gmem_keys(gmem_keys), - m_smem_vote(smem_vote), - m_gmem_vals(gmem_vals), - m_gmem_size(gmem_size), - m_any_gmem (true) - {} - - // Clear the table. It doesn't clear GMEM values. - __device__ __forceinline__ void clear(); - // Clear the table. It also clears GMEM values (set them to 0). - __device__ __forceinline__ void clear_all(); - // Insert a key/value inside the hash table. - __device__ __forceinline__ void insert( Key_type key, T a_value, T b_value, int *status ); - // Insert a key/value inside the hash table. - __device__ __forceinline__ void insert_with_duplicates( Key_type key, T val, int *status ); - // Load a set. - __device__ __forceinline__ void load( int count, const Key_type *keys, const int *pos ); - // Store the map. - __device__ __forceinline__ void store( int count, T *vals ); - // Store the map. - __device__ __forceinline__ void store( int count, Key_type *keys, T *vals ); - // Store the map. - __device__ __forceinline__ void store_map_keys_scale_values( int count, const int *map, Key_type *keys, T alpha, T *vals ); - // Store the map. - __device__ __forceinline__ void store_keys_scale_values( int count, Key_type *keys, T alpha, T *vals ); - // Update a value in the table but do not insert if it doesn't exist. - __device__ __forceinline__ bool update( Key_type key, T value ); - - protected: - // Get the selected item in the register buffer. - __device__ __forceinline__ int get_selected( int hash ) const - { - return static_cast(m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE]); - } - - // Is it the selected item in the register buffer. - __device__ __forceinline__ bool is_selected( int hash, int lane_id ) const - { - return m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE] == reinterpret_cast(lane_id); - } - - // Push my ID in the register buffer. 
- __device__ __forceinline__ void try_selection( int hash, int lane_id ) - { - m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE] = reinterpret_cast(lane_id); - } -}; - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::clear() -{ - int lane_id = utils::lane_id(); - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - m_regs_vals[i_regs] = amgx::types::util::get_zero(); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - m_gmem_keys[offset] = -1; - } - - m_any_gmem = false; -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::clear_all() -{ - int lane_id = utils::lane_id(); - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - m_regs_vals[i_regs] = amgx::types::util::get_zero(); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - m_gmem_keys[offset] = -1; - m_gmem_vals[offset] = amgx::types::util::get_zero(); - } - - m_any_gmem = false; -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::insert( Key_type key, T a_value, T b_value, int *status ) -{ - const int lane_id = utils::lane_id(); - bool done = key == -1; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( i_hash > 0 && utils::all(done) ) - { - break; - } - - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) - { - this->try_selection( hash, lane_id ); - done = true; - } - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( b_value, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] = m_regs_vals[i_regs] + a_value * other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return; - } - - m_any_gmem = true; - bool candidate = false; - unsigned ukey = 
reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - m_gmem_vals[hash] = m_gmem_vals[hash] + a_value * b_value; - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_gmem_keys[hash] = key; - } - - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. - { - m_gmem_vals[hash] = a_value * b_value; - done = true; - } - } - } - - if ( status == NULL || utils::all(done) ) - { - return; - } - - if ( lane_id == 0 ) - { - status[0] = 1; - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::insert_with_duplicates( Key_type key, T val, int *status ) -{ - const int lane_id = utils::lane_id(); - bool done = key == -1; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - break; - } - - bool candidate = false; - bool maybe_in_conflict = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - maybe_in_conflict = true; - done = true; // Is it really done??? - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) - { - this->try_selection( hash, lane_id ); - maybe_in_conflict = true; - done = true; - } - } - - // Fix conflicts. - bool in_conflict = maybe_in_conflict && !this->is_selected(hash, lane_id); - - while ( utils::any( in_conflict ) ) - { - int winner = in_conflict ? this->get_selected(hash) : WARP_SIZE; - T other_val = utils::shfl( val, winner ); - - if ( in_conflict ) - { - this->try_selection(hash, lane_id); - } - - if ( in_conflict && this->is_selected(hash, lane_id) ) - { - val = val + other_val; - in_conflict = false; - } - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( val, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] = m_regs_vals[i_regs] + other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return; - } - - m_any_gmem = true; - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - utils::atomic_add( &m_gmem_vals[hash], val ); - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_gmem_keys[hash] = key; - } - - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. 
- { - utils::atomic_add( &m_gmem_vals[hash], val ); - done = true; - } - } - } - - if ( status == NULL || utils::all(done) ) - { - return; - } - - if ( lane_id == 0 ) - { - status[0] = 1; - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::load( int count, const Key_type *keys, const int *pos ) -{ - int lane_id = utils::lane_id(); -#pragma unroll 4 - - for ( int offset = lane_id ; offset < count ; offset += WARP_SIZE ) - { - Key_type key = keys[offset]; - int idx = pos [offset]; - // Where to store the item. - volatile Key_type *ptr = m_smem_keys; - - if ( idx >= SMEM_SIZE ) - { - ptr = m_gmem_keys; - m_any_gmem = 1; - idx -= SMEM_SIZE; - m_gmem_vals[idx] = amgx::types::util::get_zero(); - } - - // Store the item. - ptr[idx] = key; - } - - m_any_gmem = utils::any( m_any_gmem ); -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::store( int count, T *vals ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - vals[dst_offset] = m_regs_vals[i_step]; - } - - warp_offset += __popc( poll ); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - vals[dst_offset] = m_gmem_vals[offset]; - } - - warp_offset += __popc( poll ); - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::store( int count, Key_type *keys, T *vals ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - vals[dst_offset] = m_regs_vals[i_step]; - } - - warp_offset += __popc( poll ); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = 
warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - vals[dst_offset] = m_gmem_vals[offset]; - } - - warp_offset += __popc( poll ); - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::store_map_keys_scale_values( int count, const int *map, Key_type *keys, T alpha, T *vals ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = map[key]; - vals[dst_offset] = alpha * m_regs_vals[i_step]; - } - - warp_offset += __popc( poll ); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = map[key]; - vals[dst_offset] = alpha * m_gmem_vals[offset]; - } - - warp_offset += __popc( poll ); - } -} - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::store_keys_scale_values( int count, Key_type *keys, T alpha, T *vals ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - vals[dst_offset] = alpha * m_regs_vals[i_step]; - } - - warp_offset += __popc( poll ); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - vals[dst_offset] = alpha * m_gmem_vals[offset]; - } - - warp_offset += __popc( poll ); - } -} - - - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -bool Hash_map::update( Key_type key, T val ) -{ - const int lane_id = utils::lane_id(); - bool done = key == -1, found = false; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( i_hash > 0 && utils::all(done) ) - { - break; - } - - unsigned ukey = 
reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - found = true; - } - - done = found || stored_key == -1; - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( val, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] += other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return found; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - m_gmem_vals[hash] += val; - found = true; - } - - done = found || stored_key == -1; - } - } - - return found; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/base/include/ld_functions.h b/base/include/ld_functions.h index 6b33771f..615cbb9e 100644 --- a/base/include/ld_functions.h +++ b/base/include/ld_functions.h @@ -102,10 +102,18 @@ __device__ __inline__ int ld_cs(const int *address) return reg; } +#if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 350) +template +__device__ __inline T ldg(const T *address) +{ + return ld_cg(address); +} +#else template __device__ __inline T ldg(const T *address) { return __ldg(address); } +#endif } //end namespace amgx diff --git a/base/include/memory_intrinsics.h b/base/include/memory_intrinsics.h index a2288f9f..d04e3a8c 100644 --- a/base/include/memory_intrinsics.h +++ b/base/include/memory_intrinsics.h @@ -36,7 +36,11 @@ static __device__ __inline__ int __load_all(const int *ptr) { int ret; asm volat static __device__ __inline__ int __load_global(const int *ptr) { int ret; asm volatile ("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int __load_streaming(const int *ptr) { int ret; asm volatile ("ld.global.cs.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int __load_lastuse(const int *ptr) { int ret; asm volatile ("ld.global.lu.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ int __load_nc(const int *ptr) { int ret; asm volatile ("ld.global.nc.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ int __load_nc(const int *ptr) { int ret; asm volatile ("ld.global.ca.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ int __load_uniform(const int *ptr) { int ret; asm volatile ("ld.global.s32.uniform %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } @@ -44,7 +48,11 @@ static __device__ __inline__ int2 __load_all(const int2 *ptr) { int2 ret; asm vo static __device__ __inline__ int2 __load_global(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cg.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int2 __load_streaming(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cs.v2.s32 {%0,%1}, [%2];" : 
"=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int2 __load_lastuse(const int2 *ptr) { int2 ret; asm volatile ("ld.global.lu.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ int2 __load_nc(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ int2 __load_nc(const int2 *ptr) { int2 ret; asm volatile ("ld.global.ca.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ int2 __load_uniform(const int2 *ptr) { int2 ret; asm volatile ("ld.global.s32.v2.uniform {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } @@ -52,7 +60,11 @@ static __device__ __inline__ int4 __load_all(const int4 *ptr) { int4 ret; asm vo static __device__ __inline__ int4 __load_global(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int4 __load_streaming(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int4 __load_lastuse(const int4 *ptr) { int4 ret; asm volatile ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ int4 __load_nc(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ int4 __load_nc(const int4 *ptr) { int4 ret; asm volatile ("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ int4 __load_uniform(const int4 *ptr) { int4 ret; asm volatile ("ld.global.v4.s32.uniform {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } @@ -60,7 +72,11 @@ static __device__ __inline__ float __load_all(const float *ptr) { float ret; asm static __device__ __inline__ float __load_global(const float *ptr) { float ret; asm volatile ("ld.global.cg.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float __load_streaming(const float *ptr) { float ret; asm volatile ("ld.global.cs.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float __load_lastuse(const float *ptr) { float ret; asm volatile ("ld.global.lu.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 +static __device__ __inline__ float __load_nc(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +#else static __device__ __inline__ float __load_nc(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ float __load_uniform(const float *ptr) { float ret; asm volatile ("ld.global.f32.uniform %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } @@ -68,7 +84,11 @@ static __device__ __inline__ float2 
__load_all(const float2 *ptr) { float2 ret; static __device__ __inline__ float2 __load_global(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cg.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float2 __load_streaming(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cs.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float2 __load_lastuse(const float2 *ptr) { float2 ret; asm volatile ("ld.global.lu.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ float2 __load_nc(const float2 *ptr) { float2 ret; asm volatile ("ld.global.nc.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ float2 __load_nc(const float2 *ptr) { float2 ret; asm volatile ("ld.global.ca.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ float2 __load_uniform(const float2 *ptr) { float2 ret; asm volatile ("ld.global.v2.f32.uniform {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } @@ -76,7 +96,11 @@ static __device__ __inline__ float4 __load_all(const float4 *ptr) { float4 ret; static __device__ __inline__ float4 __load_global(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float4 __load_streaming(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float4 __load_lastuse(const float4 *ptr) { float4 ret; asm volatile ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ float4 __load_nc(const float4 *ptr) { float4 ret; asm volatile ("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ float4 __load_nc(const float4 *ptr) { float4 ret; asm volatile ("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ float4 __load_uniform(const float4 *ptr) { float4 ret; asm volatile ("ld.global.v4.f32.uniform {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } @@ -84,7 +108,11 @@ static __device__ __inline__ double __load_all(const double *ptr) { double ret; static __device__ __inline__ double __load_global(const double *ptr) { double ret; asm volatile ("ld.global.cg.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ double __load_streaming(const double *ptr) { double ret; asm volatile ("ld.global.cs.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ double __load_lastuse(const double *ptr) { double ret; asm volatile ("ld.global.lu.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ double __load_nc(const double *ptr) { double ret; asm volatile ("ld.global.nc.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR 
(ptr)); return ret; } +#else +static __device__ __inline__ double __load_nc(const double *ptr) { double ret; asm volatile ("ld.global.ca.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ double __load_uniform(const double *ptr) { double ret; asm volatile ("ld.global.f64.uniform %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } @@ -92,5 +120,9 @@ static __device__ __inline__ double2 __load_all(const double2 *ptr) { double2 re static __device__ __inline__ double2 __load_global(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cg.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ double2 __load_streaming(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cs.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ double2 __load_lastuse(const double2 *ptr) { double2 ret; asm volatile ("ld.global.lu.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ double2 __load_nc(const double2 *ptr) { double2 ret; asm volatile ("ld.global.nc.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ double2 __load_nc(const double2 *ptr) { double2 ret; asm volatile ("ld.global.ca.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ double2 __load_uniform(const double2 *ptr) { double2 ret; asm volatile ("ld.global.v2.f64.uniform {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } diff --git a/base/include/sm_utils.inl b/base/include/sm_utils.inl index 7b71854b..5d07ae45 100644 --- a/base/include/sm_utils.inl +++ b/base/include/sm_utils.inl @@ -275,29 +275,41 @@ static __device__ __forceinline__ void load_vec4( double (&u)[4], const double * // ==================================================================================================================== static __device__ __forceinline__ unsigned int ballot(int p, unsigned int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __ballot_sync(mask, p); #else return __ballot(p); #endif +#else + return 0; +#endif } static __device__ __forceinline__ unsigned int any(int p, unsigned int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __any_sync(mask, p); #else return __any(p); #endif +#else + return 0; +#endif } static __device__ __forceinline__ unsigned int all(int p, unsigned int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __all_sync(mask, p); #else return __all(p); #endif +#else + return 0; +#endif } static __device__ __forceinline__ unsigned int activemask() @@ -323,25 +335,34 @@ static __device__ __forceinline__ void syncwarp(unsigned int mask = 0xffffffff) // ==================================================================================================================== static __device__ __forceinline__ int shfl( int r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_sync( mask, r, lane, bound ); #else return __shfl( r, lane, bound ); #endif +#else // __CUDA_ARCH__ + return 0.0f; +#endif } static __device__ __forceinline__ float shfl( float r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 
300 #if CUDART_VERSION >= 9000 return __shfl_sync( mask, r, lane, bound ); #else return __shfl( r, lane, bound ); #endif +#else // __CUDA_ARCH__ + return 0.0f; +#endif } static __device__ __forceinline__ double shfl( double r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 int hi = __shfl_sync(mask, __double2hiint(r), lane, bound ); int lo = __shfl_sync(mask, __double2loint(r), lane, bound ); @@ -351,10 +372,14 @@ static __device__ __forceinline__ double shfl( double r, int lane, int bound = w int lo = __shfl( __double2loint(r), lane, bound ); return __hiloint2double( hi, lo ); #endif +#else + return 0.0; +#endif } static __device__ __forceinline__ cuComplex shfl( cuComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 float re = __shfl_sync( mask, cuCrealf(r), lane, bound ); float im = __shfl_sync( mask, cuCimagf(r), lane, bound ); @@ -364,36 +389,52 @@ static __device__ __forceinline__ cuComplex shfl( cuComplex r, int lane, int bou float im = __shfl( cuCimagf(r), lane, bound ); return make_cuComplex(re, im); #endif +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ cuDoubleComplex shfl( cuDoubleComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 double re = shfl( cuCreal(r), lane, mask, bound ); double im = shfl( cuCimag(r), lane, mask, bound ); return make_cuDoubleComplex( re, im ); +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ int shfl_xor( int r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_xor_sync( mask, r, lane_mask, bound ); #else return __shfl_xor( r, lane_mask, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ float shfl_xor( float r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_xor_sync( mask, r, lane_mask, bound ); #else return __shfl_xor( r, lane_mask, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ double shfl_xor( double r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 int hi = __shfl_xor_sync( mask, __double2hiint(r), lane_mask, bound ); int lo = __shfl_xor_sync( mask, __double2loint(r), lane_mask, bound ); @@ -403,10 +444,14 @@ static __device__ __forceinline__ double shfl_xor( double r, int lane_mask, int int lo = __shfl_xor( __double2loint(r), lane_mask, bound ); return __hiloint2double( hi, lo ); #endif +#else + return 0.0; +#endif } static __device__ __forceinline__ cuComplex shfl_xor( cuComplex r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 float re = __shfl_xor_sync( mask, cuCrealf(r), lane_mask, bound ); float im = __shfl_xor_sync( mask, cuCimagf(r), lane_mask, bound ); @@ -416,35 +461,51 @@ static __device__ __forceinline__ cuComplex shfl_xor( cuComplex r, int lane_mask float im = __shfl_xor( cuCimagf(r), lane_mask, bound ); return make_cuComplex(re, im); #endif +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ cuDoubleComplex shfl_xor( cuDoubleComplex r, int lane_mask, int bound = warpSize, unsigned int mask = 
DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 double re = shfl_xor( cuCreal(r), lane_mask, mask, bound ); double im = shfl_xor( cuCimag(r), lane_mask, mask, bound ); return make_cuDoubleComplex( re, im ); +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ int shfl_down( int r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_down_sync( mask, r, offset, bound ); #else return __shfl_down( r, offset, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ float shfl_down( float r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_down_sync( mask, r, offset, bound ); #else return __shfl_down( r, offset, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ double shfl_down( double r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 int hi = __shfl_down_sync( mask, __double2hiint(r), offset, bound ); int lo = __shfl_down_sync( mask, __double2loint(r), offset, bound ); @@ -454,10 +515,14 @@ static __device__ __forceinline__ double shfl_down( double r, int offset, int bo int lo = __shfl_down( __double2loint(r), offset, bound ); return __hiloint2double( hi, lo ); #endif +#else + return 0.0; +#endif } static __device__ __forceinline__ cuComplex shfl_down( cuComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 float re = __shfl_down_sync( mask, cuCrealf(r), lane, bound ); float im = __shfl_down_sync( mask, cuCimagf(r), lane, bound ); @@ -467,36 +532,52 @@ static __device__ __forceinline__ cuComplex shfl_down( cuComplex r, int lane, in float im = __shfl_down( cuCimagf(r), lane, bound ); return make_cuComplex(re, im); #endif +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ cuDoubleComplex shfl_down( cuDoubleComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 double re = shfl_down( cuCreal(r), lane, bound ); double im = shfl_down( cuCimag(r), lane, bound ); return make_cuDoubleComplex( re, im ); +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ int shfl_up( int r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_up_sync( mask, r, offset, bound ); #else return __shfl_up( r, offset, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ float shfl_up( float r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_up_sync( mask, r, offset, bound ); #else return __shfl_up( r, offset, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ double shfl_up( double r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 int hi = __shfl_up_sync( mask, __double2hiint(r), offset, bound ); int lo = __shfl_up_sync( mask, __double2loint(r), offset, bound ); @@ -506,10 +587,14 @@ static __device__ __forceinline__ double shfl_up( double r, int offset, int boun int lo = __shfl_up( __double2loint(r), offset, bound ); return __hiloint2double( hi, lo ); #endif +#else + return 0.0; 
+#endif } static __device__ __forceinline__ cuComplex shfl_up( cuComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 float re = __shfl_up_sync( mask, cuCrealf(r), lane, bound ); float im = __shfl_up_sync( mask, cuCimagf(r), lane, bound ); @@ -519,13 +604,20 @@ static __device__ __forceinline__ cuComplex shfl_up( cuComplex r, int lane, int float im = __shfl_up( cuCimagf(r), lane, bound ); return make_cuComplex(re, im); #endif +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ cuDoubleComplex shfl_up( cuDoubleComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 double re = shfl_up( cuCreal(r), lane, bound ); double im = shfl_up( cuCimag(r), lane, bound ); return make_cuDoubleComplex( re, im ); +#else + return amgx::types::util::get_zero(); +#endif } // ==================================================================================================================== @@ -538,6 +630,8 @@ struct Add static __device__ __forceinline__ Value_type eval( Value_type x, Value_type y ) { return x + y; } }; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + template< int NUM_THREADS_PER_ITEM, int WarpSize > struct Warp_reduce_pow2 { @@ -579,6 +673,51 @@ struct Warp_reduce_linear } }; +#else + +template< int NUM_THREADS_PER_ITEM, int WarpSize > +struct Warp_reduce_pow2 +{ + template< typename Operator, typename Value_type > + static __device__ __inline__ Value_type execute( volatile Value_type *smem, Value_type x ) + { + int my_lane_id = utils::lane_id(); +#pragma unroll + + for ( int offset = WarpSize / 2 ; offset >= NUM_THREADS_PER_ITEM ; offset >>= 1 ) + if ( my_lane_id < offset ) + { + x = Operator::eval( x, smem[threadIdx.x + offset] ); + amgx::types::util::volcast(x, smem + threadIdx.x); + } + + return x; + } +}; + +template< int NUM_THREADS_PER_ITEM, int WarpSize > +struct Warp_reduce_linear +{ + template< typename Operator, typename Value_type > + static __device__ __inline__ Value_type execute( volatile Value_type *smem, Value_type x ) + { + const int NUM_STEPS = WarpSize / NUM_THREADS_PER_ITEM; + int my_lane_id = utils::lane_id(); +#pragma unroll + + for ( int i = 1 ; i < NUM_STEPS ; ++i ) + if ( my_lane_id < NUM_THREADS_PER_ITEM ) + { + x = Operator::eval( x, smem[threadIdx.x + i * NUM_THREADS_PER_ITEM] ); + amgx::types::util::volcast(x, smem + threadIdx.x); + } + + return x; + } +}; + +#endif + // ==================================================================================================================== template< int NUM_THREADS_PER_ITEM, int WarpSize = 32 > @@ -622,12 +761,41 @@ struct Warp_reduce<15, WarpSize> : public Warp_reduce_linear<15, WarpSize> {}; // ==================================================================================================================== +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + template< int NUM_THREADS_PER_ITEM, typename Operator, typename Value_type > static __device__ __forceinline__ Value_type warp_reduce( Value_type x ) { return Warp_reduce::template execute( x ); } +#else + +template< int NUM_THREADS_PER_ITEM, typename Operator, typename Value_type > +static __device__ __forceinline__ Value_type warp_reduce( volatile Value_type *smem, Value_type x ) +{ + return Warp_reduce::template execute( smem, x ); +} + +template< int NUM_THREADS_PER_ITEM, typename Value_type, int WarpSize = 32 > +static __device__ __forceinline__ Value_type warp_reduce_sum(volatile 
Value_type *smem, Value_type x) +{ + const int NUM_STEPS = WarpSize / NUM_THREADS_PER_ITEM; + int my_lane_id = utils::lane_id(); +#pragma unroll + + for (int i = 1; i < NUM_STEPS; ++i) + if (my_lane_id < NUM_THREADS_PER_ITEM) + { + x = x + amgx::types::util::volcast(smem[threadIdx.x + i * NUM_THREADS_PER_ITEM]); + amgx::types::util::volcast(x, smem + threadIdx.x); + } + + return x; +} + +#endif + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace utils diff --git a/base/include/strided_reduction.h b/base/include/strided_reduction.h index 2a64541a..df5ca976 100644 --- a/base/include/strided_reduction.h +++ b/base/include/strided_reduction.h @@ -99,7 +99,12 @@ template struct is_pow2 { static const int value = (I & (I - 1)) == 0; // detect arch #define strided_reduction_arch_KEPLER 2 +#define strided_reduction_arch_FERMI 1 +#if __CUDA_ARCH__ >= 350 #define strided_reduction_ARCH strided_reduction_arch_KEPLER +#else +#define strided_reduction_ARCH strided_reduction_arch_FERMI +#endif //ops struct op_sum @@ -391,6 +396,34 @@ struct warputil<8, strided_reduction_arch_KEPLER> } }; +template +struct warputil +{ + template SR_SDF void warp_reduce_stride(T &value, const OP &op) + { + __shared__ volatile T s_buf[CTA_SIZE + WARP_SIZE]; + const int laneId = utils::lane_id(); + { + s_buf[threadIdx.x] = value; +#pragma unroll + for (int i = STRIDE; i < NVALS; i *= 2) + { + T tmp = s_buf[threadIdx.x + i]; + + if (laneId + i < WARP_SIZE) + { + value = op.compute(value, tmp); + } + + s_buf[threadIdx.x] = value; + } + + if (STRIDE & (STRIDE - 1) == 0) { value = s_buf[threadIdx.x - laneId + (threadIdx.x & (STRIDE - 1))]; } + } + } +}; + + ///////////////////////////////// // STRUDED REDUCTION KERNEL ///////////////////////////////// @@ -399,10 +432,10 @@ struct warputil<8, strided_reduction_arch_KEPLER> template __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 2 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 2 ) +__launch_bounds__( CTA_SIZE, 1 ) #endif void strided_reduction(const T *X, const int N, V *sums, const TRANSFORM tx = TRANSFORM(), const OP op = OP()) { diff --git a/base/include/texture.h b/base/include/texture.h index b11dd4e2..b4913b38 100644 --- a/base/include/texture.h +++ b/base/include/texture.h @@ -38,7 +38,11 @@ namespace amgx template __inline__ __device__ T_ELEM __cachingLoad(const T_ELEM *addr) { +#if __CUDA_ARCH__ < 350 + return *addr; +#else return __ldg(addr); +#endif } } diff --git a/base/src/classical/selectors/selector.cu b/base/src/classical/selectors/selector.cu index 9882489a..0a3fccd4 100644 --- a/base/src/classical/selectors/selector.cu +++ b/base/src/classical/selectors/selector.cu @@ -119,36 +119,36 @@ void resolve_boundary(const IndexType *offsets, const IndexType *column_indices, } } -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -namespace selector_sm35 +namespace selector_sm20 { #include -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. 
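/*
 * Illustrative sketch of the dynamic work-queue idiom used by the get_work()
 * variants below: lane 0 of each warp takes a ticket from a global counter
 * with atomicAdd and the result is broadcast to the other 31 lanes. On sm_30+
 * the broadcast is a warp shuffle; the sm_20 variant falls back to one shared
 * memory slot per warp and relies on implicit warp-synchronous execution on
 * those parts. Names prefixed with sketch_ are illustrative, not AMGX symbols.
 */
static __device__ __forceinline__ int sketch_get_work_shfl( int *queue )
{
    int offset = -1;

    if ( threadIdx.x % 32 == 0 )
    {
        offset = atomicAdd( queue, 1 );                // one ticket per warp
    }

#if CUDART_VERSION >= 9000
    return __shfl_sync( 0xffffffffu, offset, 0 );      // broadcast lane 0's ticket
#else
    return __shfl( offset, 0 );
#endif
}

static __device__ __forceinline__ int sketch_get_work_smem( volatile int *offsets, int *queue, int warp_id )
{
    if ( threadIdx.x % 32 == 0 )
    {
        offsets[warp_id] = atomicAdd( queue, 1 );      // one shared slot per warp
    }

    return offsets[warp_id];                           // the other lanes read the slot
}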
-__device__ __forceinline__ int get_work( int *queue, int warp_id ) +__device__ __forceinline__ int get_work( volatile int *offsets, int *queue, int warp_id ) { - int offset = -1; - if ( utils::lane_id() == 0 ) { - offset = atomicAdd( queue, 1 ); + offsets[warp_id] = atomicAdd( queue, 1 ); } - return utils::shfl( offset, 0 ); + return offsets[warp_id]; } -} +} // namespace distance2_sm20 + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -namespace selector_sm70 +namespace selector_sm35 { #include -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. __device__ __forceinline__ int get_work( int *queue, int warp_id ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -157,10 +157,14 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) } return utils::shfl( offset, 0 ); +#else + return 0; +#endif } } + namespace selector { @@ -431,22 +435,30 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - amgx::classical::selector_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 amgx::classical::selector_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + amgx::classical::selector_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip fine rows. @@ -461,6 +473,7 @@ compute_c_hat_kernel( int A_num_rows, set.clear(); // Load the range of the row. __syncthreads(); +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -470,6 +483,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif __syncthreads(); // _iterate over the columns of A to build C_hat. @@ -592,6 +615,13 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. 
+ __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -601,16 +631,17 @@ compute_c_hat_kernel( int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - amgx::classical::selector_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 amgx::classical::selector_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + amgx::classical::selector_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip fine rows. @@ -624,6 +655,7 @@ compute_c_hat_kernel( int A_num_rows, // Clear the set. set.clear(); // Load the range of the row. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -633,6 +665,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // _iterate over the columns of A to build C_hat. for ( int a_col_it = a_col_begin + lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) diff --git a/base/src/csr_multiply.cu b/base/src/csr_multiply.cu index 616fcb43..59ca324f 100644 --- a/base/src/csr_multiply.cu +++ b/base/src/csr_multiply.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2013-2020, NVIDIA CORPORATION. All rights reserved. +/* Copyright (c) 2013-2017, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,8 +26,8 @@ */ #include +#include #include -#include #include #include #include @@ -43,16 +43,16 @@ void *CSR_Multiply >::csr_workspace_create( cudaDeviceProp props = getDeviceProperties(); int arch = 10 * props.major + props.minor; - if ( arch >= 70 ) - { - return new CSR_Multiply_Sm70(); - } if ( arch >= 35 ) { return new CSR_Multiply_Sm35(); } + else if ( arch >= 20 ) + { + return new CSR_Multiply_Sm20(); + } - FatalError( "CSR_Multiply: Unsupported architecture. It requires a Kepler GPU or newer!!!", AMGX_ERR_NOT_SUPPORTED_BLOCKSIZE ); + FatalError( "CSR_Multiply: Unsupported architecture. 
It requires a Fermi GPU or newer!!!", AMGX_ERR_NOT_SUPPORTED_BLOCKSIZE ); } // ==================================================================================================================== @@ -64,20 +64,20 @@ void *CSR_Multiply >::csr_workspace_create( cudaDeviceProp props = getDeviceProperties(); int arch = 10 * props.major + props.minor; - if ( arch >= 70 ) + if ( arch >= 35 ) { - CSR_Multiply_Sm70 *wk = new CSR_Multiply_Sm70(); + CSR_Multiply_Sm35 *wk = new CSR_Multiply_Sm35(); wk->set_max_attempts(max_attempts); return wk; } - if ( arch >= 35 ) + else if ( arch >= 20 ) { - CSR_Multiply_Sm35 *wk = new CSR_Multiply_Sm35(); + CSR_Multiply_Sm20 *wk = new CSR_Multiply_Sm20(); wk->set_max_attempts(max_attempts); return wk; } - FatalError( "CSR_Multiply: Unsupported architecture. It requires a Kepler GPU or newer!!!", AMGX_ERR_NOT_SUPPORTED_BLOCKSIZE ); + FatalError( "CSR_Multiply: Unsupported architecture. It requires a Fermi GPU or newer!!!", AMGX_ERR_NOT_SUPPORTED_BLOCKSIZE ); } // ==================================================================================================================== diff --git a/base/src/csr_multiply_sm35.cu b/base/src/csr_multiply_sm35.cu index b86d76cc..c8376ec5 100644 --- a/base/src/csr_multiply_sm35.cu +++ b/base/src/csr_multiply_sm35.cu @@ -31,7 +31,6 @@ #include #include #include -#include /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -57,6 +56,7 @@ void flag_halo_rows(int *row_ids, int size, int *flagArray, int neighbor, int gl __device__ __forceinline__ int get_work( int *queue, int warp_id ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -65,6 +65,9 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) } return utils::shfl( offset, 0 ); +#else + return 1 << 28; +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -88,6 +91,7 @@ count_non_zeroes_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; @@ -229,6 +233,8 @@ count_non_zeroes_kernel( const int A_num_rows, set.store( count, &C_cols[c_col_it] ); } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -252,6 +258,7 @@ count_non_zeroes_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. @@ -399,6 +406,8 @@ count_non_zeroes_kernel( const int A_num_rows, set.store( count, &C_cols[c_col_it] ); } } + +#endif } template @@ -483,6 +492,7 @@ count_non_zeroes_RAP_ext_kernel( const int RAP_int_num_rows, int num_neighbors, int global_id ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; @@ -571,6 +581,8 @@ count_non_zeroes_RAP_ext_kernel( const int RAP_int_num_rows, set.store( count, &RAP_cols[rap_col_it] ); } } + +#endif } @@ -608,6 +620,7 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Tables to broadcast values. 
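/*
 * The "tables to broadcast values" declared below are filled with a warp
 * stream-compaction idiom that recurs throughout these kernels: each lane
 * computes a predicate, a ballot collects the votes, and a popcount over the
 * votes of the lower lanes (lane_mask_lt) gives every qualifying lane a dense
 * destination slot. A minimal sketch of that idiom with raw intrinsics
 * (sketch_warp_compact is an illustrative name, not an AMGX symbol):
 */
__device__ int sketch_warp_compact( bool pred, int value, volatile int *s_out )
{
    const unsigned full = 0xffffffffu;
    const unsigned vote = __ballot_sync( full, pred );
    const unsigned lt   = ( 1u << ( threadIdx.x % 32 ) ) - 1u;  // lanes below mine
    const int dest = __popc( vote & lt );

    if ( pred )
    {
        s_out[dest] = value;      // packed contiguously, in lane order
    }

    return __popc( vote );        // number of packed items
}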
__shared__ volatile int s_b_rows[CTA_SIZE], s_b_colors[CTA_SIZE]; @@ -768,6 +781,8 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, set.store( count, &C_cols[c_col_it] ); } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -786,6 +801,7 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // Tables to broadcast values. @@ -959,6 +975,8 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, set.store( count, &C_cols[c_col_it] ); } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -986,6 +1004,7 @@ compute_values_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / 32; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; @@ -1114,6 +1133,8 @@ compute_values_kernel( const int A_num_rows, map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1141,6 +1162,7 @@ compute_values_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. @@ -1277,6 +1299,8 @@ compute_values_kernel( const int A_num_rows, map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); } + +#endif } @@ -1301,6 +1325,7 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, int num_neighbors, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; @@ -1366,6 +1391,8 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, map.store( count, &RAP_cols[rap_col_it], &RAP_vals[rap_col_it] ); } + +#endif } diff --git a/base/src/csr_multiply_sm70.cu b/base/src/csr_multiply_sm70.cu deleted file mode 100644 index 7d4063d5..00000000 --- a/base/src/csr_multiply_sm70.cu +++ /dev/null @@ -1,2124 +0,0 @@ -/* Copyright (c) 2013-2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace csr_multiply_sm70 -{ - -#include - -#include -#include // Included inside the namespace to solve name colisions. - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -__global__ -void flag_halo_rows(int *row_ids, int size, int *flagArray, int neighbor, int global_id) -{ - for (int tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < size; tidx += blockDim.x * gridDim.x) - { - int row_id = row_ids[tidx]; - flagArray[row_id] = tidx; - } -} - -__device__ __forceinline__ int get_work( int *queue, int warp_id ) -{ - int offset = -1; - - if ( utils::lane_id() == 0 ) - { - offset = atomicAdd( queue, 1 ); - } - - return utils::shfl( offset, 0 ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY > -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_kernel( const int A_num_rows, - const int *A_rows, - const int *A_cols, - const int *B_rows, - const int *B_cols, - int *C_rows, - int *C_cols, - int *Aq1, - int *Bq1, - int *Aq2, - int *Bq2, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id(); - const int lane_id = utils::lane_id(); - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - int c_row_id = a_row_id; - - if (Aq1 != NULL) - { - a_row_id = Aq1[a_row_id]; - } - - // Make sure we have to proceed. - if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. 
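/*
 * The surrounding loop implements a warp-cooperative expansion idiom: each
 * lane loads one column of A (i.e. one row of B), ballot/popc count how many
 * lanes hold a valid row, and the warp then walks those rows one at a time,
 * broadcasting each row id with a shuffle so all 32 lanes scan that row of B
 * together (active lanes are the first num_rows lanes because a_col_it is
 * strided by lane). A condensed sketch of just that idiom, with the utils::
 * wrappers replaced by raw intrinsics and the hash set omitted:
 */
__device__ void sketch_expand_row( const int *A_cols, int a_col_it, int a_col_end,
                                   const int *B_rows, const int *B_cols )
{
    const unsigned full = 0xffffffffu;
    const int lane = threadIdx.x % 32;

    for ( a_col_it += lane ; __any_sync( full, a_col_it < a_col_end ) ; a_col_it += 32 )
    {
        const bool active = a_col_it < a_col_end;
        const int b_row = active ? A_cols[a_col_it] : -1;
        const int num_rows = __popc( __ballot_sync( full, active ) );

        for ( int k = 0 ; k < num_rows ; ++k )
        {
            // Broadcast lane k's B-row id to the whole warp.
            const int row = __shfl_sync( full, b_row, k );
            const int beg = B_rows[row], end = B_rows[row + 1];

            for ( int it = beg + lane ; it < end ; it += 32 )
            {
                const int col = B_cols[it];
                (void) col;  // a real kernel inserts col into the per-warp hash set
            }
        }
    }
}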
- int b_row_id = -1; - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - - //b_row_id is actually column of A - if (Aq2 != NULL) - { - b_row_id = Aq2[b_row_id]; - } - - if (Bq1 != NULL) - { - b_row_id = Bq1[b_row_id]; - } - } - - // The number of valid rows. - const int num_rows = __popc( utils::ballot(is_active) ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; ++k ) - { - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = utils::shfl( b_row_id, k ); - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( lane_id < 2 ) - { - b_col_tmp = utils::Ld::load( &B_rows[uniform_b_row_id + lane_id] ); - } - - int b_col_it = utils::shfl( b_col_tmp, 0 ); - int b_col_end = utils::shfl( b_col_tmp, 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id ; utils::any(b_col_it < b_col_end) ; b_col_it += WARP_SIZE ) - { - int b_col_id = -1; - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &B_cols[b_col_it] ); - - // b_col_id is actually column of B - if (Bq2 != NULL) - { - b_col_id = Bq2[b_col_id]; - } - } - - set.insert( b_col_id, COUNT_ONLY ? wk_status : NULL ); - } - } - } - - // Store the results. - if ( COUNT_ONLY ) - { - int count = set.compute_size(); - - if ( lane_id == 0 ) - { - C_rows[c_row_id] = count; - } - } - else - { - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[c_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &C_cols[c_col_it] ); - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int NUM_THREADS_PER_ROW, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY > -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const int *__restrict B_rows, - const int *__restrict B_cols, - int *__restrict C_rows, - int *__restrict C_cols, - int *Aq1, - int *Bq1, - int *Aq2, - int *Bq2, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // Constants. - const int lane_id_div_num_threads = lane_id / NUM_THREADS_PER_ROW; - const int lane_id_mod_num_threads = lane_id % NUM_THREADS_PER_ROW; - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - int c_row_id = a_row_id; - - if (Aq1 != NULL) - { - a_row_id = Aq1[a_row_id]; - } - - // Make sure we have to proceed. 
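/*
 * The COUNT_ONLY early-out just below (set.has_failed() / *wk_status) is the
 * device half of a count-then-retry protocol: when a warp's hash set
 * overflows, it raises the status flag and every warp bails out; the presence
 * of set_max_attempts in csr_multiply.cu above suggests the host then re-runs
 * the counting pass with a larger global hash table. A rough, toy sketch of
 * that protocol under those assumptions (sketch_ names and the failure
 * threshold are placeholders, not AMGX behaviour):
 */
__global__ void sketch_count_kernel( int gmem_size, int *status )
{
    // Toy stand-in: pretend counting fails whenever the capacity is too small.
    if ( gmem_size < 8192 && threadIdx.x == 0 && blockIdx.x == 0 )
    {
        atomicExch( status, 1 );   // signal "hash set overflowed, please retry"
    }
}

void sketch_count_with_retry( int max_attempts )
{
    int *d_status = NULL;
    cudaMalloc( &d_status, sizeof(int) );
    int gmem_size = 2048;                                  // initial per-row hash capacity

    for ( int attempt = 0 ; attempt < max_attempts ; ++attempt )
    {
        cudaMemset( d_status, 0, sizeof(int) );
        sketch_count_kernel<<<128, 256>>>( gmem_size, d_status );
        int status = 0;
        cudaMemcpy( &status, d_status, sizeof(int), cudaMemcpyDeviceToHost );

        if ( status == 0 ) { break; }                      // counting succeeded
        gmem_size *= 2;                                    // overflow: retry with more room
    }

    cudaFree( d_status );
}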
- if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. - int b_row_id = -1; - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - - //b_row_id is actually column of A - if (Aq2 != NULL) - { - b_row_id = Aq2[b_row_id]; - } - - if (Bq1 != NULL) - { - b_row_id = Bq1[b_row_id]; - } - } - - const int num_rows = __popc( utils::ballot(is_active) ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS ) - { - int local_k = k + lane_id_div_num_threads; - // Is it an active thread. - bool is_active_k = local_k < num_rows; - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = utils::shfl( b_row_id, local_k ); - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( is_active_k && lane_id_mod_num_threads < 2 ) - { - b_col_tmp = utils::Ld::load( &B_rows[uniform_b_row_id + lane_id_mod_num_threads] ); - } - - int b_col_it = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 0 ); - int b_col_end = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id_mod_num_threads ; utils::any(b_col_it < b_col_end) ; b_col_it += NUM_THREADS_PER_ROW ) - { - int b_col_id = -1; - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &B_cols[b_col_it] ); - - // b_col_id is actually column of B - if (Bq2 != NULL) - { - b_col_id = Bq2[b_col_id]; - } - } - - set.insert( b_col_id, COUNT_ONLY ? wk_status : NULL ); - } - } - } - - // Store the results. - if ( COUNT_ONLY ) - { - int count = set.compute_size_with_duplicates(); - - if ( lane_id == 0 ) - { - C_rows[c_row_id] = count; - } - } - else - { - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[c_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &C_cols[c_col_it] ); - } - } -} - -template -__device__ __forceinline__ void sparse_add_process_row(int row_id, const int *__restrict__ row_offsets, const int *__restrict__ col_indices, int lane_id, Hash_set &set, int *wk_status, int global_id, bool print_flag) -{ - // Load the range of the row of RAP_int - int col_tmp = -1; - - if ( lane_id < 2 ) - { - col_tmp = utils::Ld::load( &row_offsets[row_id + lane_id] ); - } - - int col_it = utils::shfl( col_tmp, 0 ); - int col_end = utils::shfl( col_tmp, 1 ); - - // Iterate over the columns of RAP_int - for ( col_it += lane_id ; utils::any(col_it < col_end) ; col_it += WARP_SIZE ) - { - int col_id = -1; - - if ( col_it < col_end ) - { - col_id = utils::Ld::load( &col_indices[col_it] ); - } - - set.insert( col_id, COUNT_ONLY ? 
wk_status : NULL ); - } -} - - -template -__device__ __forceinline__ void sparse_add_process_row_values(int row_id, const int *__restrict__ row_offsets, const int *__restrict__ col_indices, const Value_type *__restrict vals, int lane_id, Hash_map &map, int *wk_status) -{ - // Load the range of the row. - int col_tmp = -1; - - if ( lane_id < 2 ) - { - col_tmp = utils::Ld::load( &row_offsets[row_id + lane_id] ); - } - - int col_it = utils::shfl( col_tmp, 0 ); - int col_end = utils::shfl( col_tmp, 1 ); - - // Iterate over the columns of A. - for ( col_it += lane_id ; utils::any(col_it < col_end) ; col_it += WARP_SIZE ) - { - const bool is_active = col_it < col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. - int col_id(-1); - Value_type value = amgx::types::util::get_zero(); - - if ( is_active ) - { - col_id = utils::Ld::load( &col_indices[col_it] ); - value = utils::Ld::load( &vals[col_it] ); - } - - map.insert_with_duplicates( col_id, value, wk_status ); - } -} - - - - -template -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_RAP_ext_kernel( const int RAP_int_num_rows, - const int *__restrict RAP_int_rows, - const int *__restrict RAP_int_cols, - int **RAP_ext_row_ptrs, - int **RAP_ext_col_ptrs, - int *__restrict RAP_rows, - int *__restrict RAP_cols, - int **flagArray_ptrs, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status, - int num_neighbors, - int global_id ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // First threads load the row IDs of A needed by the CTA... - int rap_int_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[rap_int_row_id * gmem_size], gmem_size ); - - // Loop over rows of RAP_ext - for ( ; rap_int_row_id < RAP_int_num_rows; rap_int_row_id = get_work( wk_work_queue, warp_id ) ) - { - // Make sure we have to proceed. - if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // --------------------------------- - // First process RAP_int - // --------------------------------- - bool print_flag = false; - - if (rap_int_row_id == 3 && global_id == 0) - { - print_flag = true; - } - - sparse_add_process_row(rap_int_row_id, RAP_int_rows, RAP_int_cols, lane_id, set, wk_status, global_id, print_flag); - - // --------------------------------- - // Then process RAP_ext - // --------------------------------- - - for (int i = 0; i < num_neighbors; i++) - { - int flag = flagArray_ptrs[i][rap_int_row_id]; - - if (flag != -1) - { - int *RAP_ext_rows = RAP_ext_row_ptrs[i]; - int *RAP_ext_cols = RAP_ext_col_ptrs[i]; - int pos_in_row_ext = flag; - sparse_add_process_row(pos_in_row_ext, RAP_ext_rows, RAP_ext_cols, lane_id, set, wk_status, global_id, print_flag); - } - } - - // Store the results. 
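/*
 * The COUNT_ONLY template parameter drives a two-pass symbolic phase: the
 * first pass writes a per-row non-zero count into the row-offset array, the
 * host turns the counts into offsets with an exclusive scan, and the second
 * pass re-runs the same kernel with COUNT_ONLY=false to fill the column
 * indices into the reserved slots. A serial CPU reference of that
 * count -> scan -> fill scheme for C = A * B in CSR form, meant only as a
 * mental model of what the warp kernels compute:
 */
#include <vector>
#include <unordered_set>

void sketch_spgemm_symbolic( int n_rows,
                             const std::vector<int> &A_rows, const std::vector<int> &A_cols,
                             const std::vector<int> &B_rows, const std::vector<int> &B_cols,
                             std::vector<int> &C_rows, std::vector<int> &C_cols )
{
    C_rows.assign( n_rows + 1, 0 );

    // Pass 1: count distinct columns per row of C (the COUNT_ONLY path).
    for ( int i = 0 ; i < n_rows ; ++i )
    {
        std::unordered_set<int> row;
        for ( int k = A_rows[i] ; k < A_rows[i + 1] ; ++k )
            for ( int j = B_rows[A_cols[k]] ; j < B_rows[A_cols[k] + 1] ; ++j )
                row.insert( B_cols[j] );
        C_rows[i + 1] = (int) row.size();
    }

    // Exclusive scan: per-row counts become row offsets.
    for ( int i = 0 ; i < n_rows ; ++i ) C_rows[i + 1] += C_rows[i];

    // Pass 2: fill the column indices (the COUNT_ONLY=false path).
    C_cols.resize( C_rows[n_rows] );
    for ( int i = 0 ; i < n_rows ; ++i )
    {
        std::unordered_set<int> row;
        int out = C_rows[i];
        for ( int k = A_rows[i] ; k < A_rows[i + 1] ; ++k )
            for ( int j = B_rows[A_cols[k]] ; j < B_rows[A_cols[k] + 1] ; ++j )
                if ( row.insert( B_cols[j] ).second ) C_cols[out++] = B_cols[j];
    }
}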
- if ( COUNT_ONLY ) - { - int count = set.compute_size_with_duplicates(); - - if ( lane_id == 0 ) - { - RAP_rows[rap_int_row_id] = count; - } - } - else - { - int rap_col_tmp = -1; - - if ( lane_id < 2 ) - { - rap_col_tmp = utils::Ld::load( &RAP_rows[rap_int_row_id + lane_id] ); - } - - int rap_col_it = utils::shfl( rap_col_tmp, 0 ); - int rap_col_end = utils::shfl( rap_col_tmp, 1 ); - // Store the results. - int count = rap_col_end - rap_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &RAP_cols[rap_col_it] ); - } - } -} - - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -struct Without_external_diag -{ - static __device__ __forceinline__ bool is_active ( int a_col_it, int a_col_end ) { return a_col_it < a_col_end; } - static __device__ __forceinline__ bool is_boundary( int a_col_it, int a_col_end ) { return false; } -}; - -// ==================================================================================================================== - -struct With_external_diag -{ - static __device__ __forceinline__ bool is_active ( int a_col_it, int a_col_end ) { return a_col_it <= a_col_end; } - static __device__ __forceinline__ bool is_boundary( int a_col_it, int a_col_end ) { return a_col_it == a_col_end; } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY, typename Diag_traits > -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_ilu1_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const int *__restrict A_coloring, - int *__restrict C_rows, - int *__restrict C_cols, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Tables to broadcast values. - __shared__ volatile int s_b_rows[CTA_SIZE], s_b_colors[CTA_SIZE]; - // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - // Make sure we have to proceed. - if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // The color of the row. - int a_row_color = A_coloring[a_row_id]; - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(Diag_traits::is_active(a_col_it, a_col_end)) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. 
- int b_row_id = -1; - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - } - - if ( Diag_traits::is_boundary(a_col_it, a_col_end) ) - { - b_row_id = a_row_id; - } - - // Push the columns in the set. - set.insert( b_row_id, COUNT_ONLY ? wk_status : NULL ); - - // Skip computation if the color of the row is 0. - if ( a_row_color != 0 ) - { - // Gather the colors of the columns. - int b_row_color = -1; - - if ( is_active ) - { - b_row_color = A_coloring[b_row_id]; - } - - // The number of valid rows. - int pred = is_active && b_row_color < a_row_color; - int vote = utils::ballot( pred ); - int dest = __popc( vote & utils::lane_mask_lt() ); - - if ( pred ) - { - s_b_rows [warp_id * WARP_SIZE + dest] = b_row_id; - s_b_colors[warp_id * WARP_SIZE + dest] = b_row_color; // TODO: store an int2 rather than 2 ints of SM35 with 64bit banks. - } - - const int num_rows = __popc( vote ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; ++k ) - { - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = s_b_rows [warp_id * WARP_SIZE + k]; - const int uniform_b_color = s_b_colors[warp_id * WARP_SIZE + k]; - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( lane_id < 2 ) - { - b_col_tmp = utils::Ld::load( &A_rows[uniform_b_row_id + lane_id] ); - } - - int b_col_it = utils::shfl( b_col_tmp, 0 ); - int b_col_end = utils::shfl( b_col_tmp, 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id ; utils::any(b_col_it < b_col_end) ; b_col_it += WARP_SIZE ) - { - int b_col_id = -1, b_col_color = -1; - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &A_cols[b_col_it] ); - b_col_color = utils::Ld::load( &A_coloring[b_col_id] ); - } - - int item = -1; - - if ( b_col_color >= uniform_b_color && b_col_color != a_row_color ) - { - item = b_col_id; - } - - set.insert( item, COUNT_ONLY ? wk_status : NULL ); - } - } - } - } - - // Store the results. - if ( COUNT_ONLY ) - { - int count = set.compute_size(); - - if ( lane_id == 0 ) - { - C_rows[a_row_id] = count; - } - } - else - { - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[a_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &C_cols[c_col_it] ); - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int NUM_THREADS_PER_ROW, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY, typename Diag_traits > -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_ilu1_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const int *__restrict A_coloring, - int *__restrict C_rows, - int *__restrict C_cols, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; - // Tables to broadcast values. - __shared__ volatile int s_b_rows[CTA_SIZE], s_b_colors[CTA_SIZE]; - // The hash keys stored in shared memory. - __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. 
- const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // Constants. - const int lane_id_div_num_threads = lane_id / NUM_THREADS_PER_ROW; - const int lane_id_mod_num_threads = lane_id % NUM_THREADS_PER_ROW; - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - // Make sure we have to proceed. - if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // The color of the row. - int a_row_color = A_coloring[a_row_id]; - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(Diag_traits::is_active(a_col_it, a_col_end)) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. - int b_row_id = -1; - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - } - - if ( Diag_traits::is_boundary(a_col_it, a_col_end) ) - { - b_row_id = a_row_id; - } - - // Push the columns in the set. - set.insert( b_row_id, COUNT_ONLY ? wk_status : NULL ); - - // Skip computation if the color of the row is 0. - if ( a_row_color != 0 ) - { - // Gather the colors of the columns. - int b_row_color = -1; - - if ( is_active ) - { - b_row_color = A_coloring[b_row_id]; - } - - // The number of valid rows. - int pred = is_active && b_row_color < a_row_color; - int vote = utils::ballot( pred ); - int dest = __popc( vote & utils::lane_mask_lt() ); - - if ( pred ) - { - s_b_rows [warp_id * WARP_SIZE + dest] = b_row_id; - s_b_colors[warp_id * WARP_SIZE + dest] = b_row_color; - } - - const int num_rows = __popc( vote ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS ) - { - int local_k = k + lane_id_div_num_threads; - // Is it an active thread. - bool is_active_k = local_k < num_rows; - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - int uniform_b_row_id = -1, uniform_b_color = -1; - - if ( is_active_k ) - { - uniform_b_row_id = s_b_rows [warp_id * WARP_SIZE + local_k]; - uniform_b_color = s_b_colors[warp_id * WARP_SIZE + local_k]; - } - - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( is_active_k && lane_id_mod_num_threads < 2 ) - { - b_col_tmp = utils::Ld::load( &A_rows[uniform_b_row_id + lane_id_mod_num_threads] ); - } - - int b_col_it = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 0 ); - int b_col_end = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 1 ); - - // Iterate over the range of columns of B. 
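/*
 * The NUM_THREADS_PER_ROW kernel variants split the 32-lane warp into
 * 32 / NUM_THREADS_PER_ROW groups so that several rows of B are scanned at
 * once instead of one row per whole warp: lane_id_div_num_threads selects the
 * group and lane_id_mod_num_threads the position inside it. A minimal sketch
 * of that decomposition; the row ids are passed as a plain array here for
 * simplicity, whereas the kernels broadcast begin/end offsets with shuffles
 * from lane (group * NUM_THREADS_PER_ROW + 0/1):
 */
template< int NUM_THREADS_PER_ROW >
__device__ void sketch_row_groups( const int *B_rows, const int *B_cols,
                                   const int *b_row_ids, int num_rows )
{
    const int lane_id = threadIdx.x % 32;
    const int group   = lane_id / NUM_THREADS_PER_ROW;   // which row this lane helps scan
    const int sublane = lane_id % NUM_THREADS_PER_ROW;   // position inside the group
    const int NUM_LOADED_ROWS = 32 / NUM_THREADS_PER_ROW;

    for ( int k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS )
    {
        const int local_k = k + group;
        if ( local_k >= num_rows ) { break; }
        const int row = b_row_ids[local_k];

        // Each group of NUM_THREADS_PER_ROW lanes strides through its own row.
        for ( int it = B_rows[row] + sublane ; it < B_rows[row + 1] ; it += NUM_THREADS_PER_ROW )
        {
            const int col = B_cols[it];
            (void) col;  // a real kernel inserts col into the per-warp hash container
        }
    }
}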
- for ( b_col_it += lane_id_mod_num_threads ; utils::any(b_col_it < b_col_end) ; b_col_it += NUM_THREADS_PER_ROW ) - { - int b_col_id = -1, b_col_color = -1; - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &A_cols[b_col_it] ); - b_col_color = utils::Ld::load( &A_coloring[b_col_id] ); - } - - int item = -1; - - if ( b_col_color >= uniform_b_color && b_col_color != a_row_color ) - { - item = b_col_id; - } - - set.insert( item, COUNT_ONLY ? wk_status : NULL ); - } - } - } - } - - // Store the results. - if ( COUNT_ONLY ) - { - int count = set.compute_size_with_duplicates(); - - if ( lane_id == 0 ) - { - C_rows[a_row_id] = count; - } - } - else - { - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[a_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &C_cols[c_col_it] ); - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE > -__global__ __launch_bounds__( CTA_SIZE, 6 ) -void -compute_values_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const Value_type *__restrict A_vals, - const int *__restrict B_rows, - const int *__restrict B_cols, - const Value_type *__restrict B_vals, - const int *__restrict C_rows, - int *__restrict C_cols, - Value_type *__restrict C_vals, - int *Aq1, - int *Bq1, - int *Aq2, - int *Bq2, - const int gmem_size, - int *g_keys, - Value_type *g_vals, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / 32; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id(); - const int lane_id = utils::lane_id(); - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_map map( &s_keys[warp_id * SMEM_SIZE], - &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], - &g_vals[a_row_id * gmem_size], - gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - int c_row_id = a_row_id; - - if (Aq1 != NULL) - { - a_row_id = Aq1[a_row_id]; - } - - // Clear the map. - map.clear(); - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. 
- int b_row_id = -1; - Value_type a_value = amgx::types::util::get_zero(); - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - a_value = utils::Ld::load( &A_vals[a_col_it] ); - - //b_row_id is actually column of A - if (Aq2 != NULL) - { - b_row_id = Aq2[b_row_id]; - } - - if (Bq1 != NULL) - { - b_row_id = Bq1[b_row_id]; - } - } - - const int num_rows = __popc( utils::ballot(is_active) ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; ++k ) - { - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = utils::shfl( b_row_id, k ); - // The value of A. - const Value_type uniform_a_value = utils::shfl( a_value, k ); - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( lane_id < 2 ) - { - b_col_tmp = utils::Ld::load( &B_rows[uniform_b_row_id + lane_id] ); - } - - int b_col_it = utils::shfl( b_col_tmp, 0 ); - int b_col_end = utils::shfl( b_col_tmp, 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id ; utils::any(b_col_it < b_col_end) ; b_col_it += WARP_SIZE ) - { - int b_col_id = -1; - Value_type b_value = amgx::types::util::get_zero(); - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &B_cols[b_col_it] ); - b_value = utils::Ld::load( &B_vals[b_col_it] ); - - if (Bq2 != NULL) - { - b_col_id = Bq2[b_col_id]; - } - } - - map.insert( b_col_id, uniform_a_value, b_value, wk_status ); - } - } - } - - // Store the results. - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[c_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int NUM_THREADS_PER_ROW, typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE > -__global__ __launch_bounds__( CTA_SIZE, 6 ) -void -compute_values_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const Value_type *__restrict A_vals, - const int *__restrict B_rows, - const int *__restrict B_cols, - const Value_type *__restrict B_vals, - const int *__restrict C_rows, - int *__restrict C_cols, - Value_type *__restrict C_vals, - int *Aq1, - int *Bq1, - int *Aq2, - int *Bq2, - const int gmem_size, - int *g_keys, - Value_type *g_vals, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // Constants. - const int lane_id_div_num_threads = lane_id / NUM_THREADS_PER_ROW; - const int lane_id_mod_num_threads = lane_id % NUM_THREADS_PER_ROW; - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. 
- Hash_map map( &s_keys[warp_id * SMEM_SIZE], - &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], - &g_vals[a_row_id * gmem_size], - gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - int c_row_id = a_row_id; - - if (Aq1 != NULL) - { - a_row_id = Aq1[a_row_id]; - } - - // Clear the map. - map.clear_all(); - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. - int b_row_id(-1); - Value_type a_value = amgx::types::util::get_zero(); - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - a_value = utils::Ld::load( &A_vals[a_col_it] ); - - //b_row_id is actually column of A - if (Aq2 != NULL) - { - b_row_id = Aq2[b_row_id]; - } - - if (Bq1 != NULL) - { - b_row_id = Bq1[b_row_id]; - } - } - - const int num_rows = __popc( utils::ballot(is_active) ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS ) - { - int local_k = k + lane_id_div_num_threads; - // Is it an active thread. - bool is_active_k = local_k < num_rows; - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = utils::shfl( b_row_id, k + lane_id_div_num_threads ); - // The value of A. - const Value_type uniform_a_value = utils::shfl( a_value, k + lane_id_div_num_threads ); - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( is_active_k && lane_id_mod_num_threads < 2 ) - { - b_col_tmp = utils::Ld::load( &B_rows[uniform_b_row_id + lane_id_mod_num_threads] ); - } - - int b_col_it = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 0 ); - int b_col_end = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id_mod_num_threads ; utils::any(b_col_it < b_col_end) ; b_col_it += NUM_THREADS_PER_ROW ) - { - int b_col_id(-1); - Value_type b_value = amgx::types::util::get_zero(); - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &B_cols[b_col_it] ); - b_value = utils::Ld::load( &B_vals[b_col_it] ); - - //b_col_id is actually column of B - if (Bq2 != NULL) - { - b_col_id = Bq2[b_col_id]; - } - } - - map.insert_with_duplicates( b_col_id, uniform_a_value * b_value, wk_status ); - } - } - } - - // Store the results. - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[c_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. 
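/*
 * The numeric phase mirrors the symbolic one: for each row i of A, the warp
 * accumulates a_ik * b_kj into a per-row hash map keyed by the column j, then
 * flushes the map into the slots of C reserved by the counting pass (the
 * map.store call just below). A serial reference of that accumulation, using
 * std::map so the output columns come out sorted; it is a mental model only,
 * not the Hash_map used by these kernels:
 */
#include <map>
#include <vector>

void sketch_spgemm_numeric_row( int i,
                                const std::vector<int> &A_rows, const std::vector<int> &A_cols, const std::vector<double> &A_vals,
                                const std::vector<int> &B_rows, const std::vector<int> &B_cols, const std::vector<double> &B_vals,
                                const std::vector<int> &C_rows, std::vector<int> &C_cols, std::vector<double> &C_vals )
{
    std::map<int, double> acc;                           // column -> accumulated value

    for ( int k = A_rows[i] ; k < A_rows[i + 1] ; ++k )
    {
        const int    b_row = A_cols[k];
        const double a_ik  = A_vals[k];

        for ( int j = B_rows[b_row] ; j < B_rows[b_row + 1] ; ++j )
        {
            acc[B_cols[j]] += a_ik * B_vals[j];          // the insert_with_duplicates step
        }
    }

    int out = C_rows[i];                                 // slots reserved by the symbolic phase
    for ( std::map<int, double>::const_iterator it = acc.begin() ; it != acc.end() ; ++it )
    {
        C_cols[out] = it->first;
        C_vals[out] = it->second;
        ++out;
    }
}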
- int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); - } -} - - -template< typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE > -__global__ __launch_bounds__( CTA_SIZE, 6 ) -void -compute_values_RAP_ext_kernel( const int RAP_int_num_rows, - const int *__restrict RAP_int_rows, - const int *__restrict RAP_int_cols, - const Value_type *__restrict RAP_int_vals, - int **RAP_ext_row_ptrs, - int **RAP_ext_col_ptrs, - Value_type **RAP_ext_val_ptrs, - int *__restrict RAP_rows, - int *__restrict RAP_cols, - Value_type *__restrict RAP_vals, - int **flagArray_ptrs, - const int gmem_size, - int *g_keys, - Value_type *g_vals, - int *wk_work_queue, - int num_neighbors, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // First threads load the row IDs of A needed by the CTA... - int rap_int_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_map map( &s_keys[warp_id * SMEM_SIZE], - &g_keys[rap_int_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], - &g_vals[rap_int_row_id * gmem_size], - gmem_size ); - - // Loop over rows of RAP_ext - for ( ; rap_int_row_id < RAP_int_num_rows ; rap_int_row_id = get_work( wk_work_queue, warp_id ) ) - { - // Clear the map. - map.clear_all(); - // --------------------------------- - // First process RAP_int - // --------------------------------- - sparse_add_process_row_values(rap_int_row_id, RAP_int_rows, RAP_int_cols, RAP_int_vals, lane_id, map, wk_status); - - // --------------------------------- - // Then process RAP_ext - // --------------------------------- - - for (int i = 0; i < num_neighbors; i++) - { - int flag = flagArray_ptrs[i][rap_int_row_id]; - - if (flag != -1) - { - int *RAP_ext_rows = RAP_ext_row_ptrs[i]; - int *RAP_ext_cols = RAP_ext_col_ptrs[i]; - Value_type *RAP_ext_vals = RAP_ext_val_ptrs[i]; - int pos_in_row_ext = flag; - sparse_add_process_row_values(pos_in_row_ext, RAP_ext_rows, RAP_ext_cols, RAP_ext_vals, lane_id, map, wk_status); - } - } - - // Store the results. - int rap_col_tmp = -1; - - if ( lane_id < 2 ) - { - rap_col_tmp = utils::Ld::load( &RAP_rows[rap_int_row_id + lane_id] ); - } - - int rap_col_it = utils::shfl( rap_col_tmp, 0 ); - int rap_col_end = utils::shfl( rap_col_tmp, 1 ); - // Store the results. 
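The RAP_ext kernel above merges, for every owned coarse row, the locally computed RAP_int row with the rows received from each neighbor; a per-neighbor flag array maps the owned row either to its position inside that neighbor's RAP_ext fragment or to -1 when the neighbor sent nothing. The same merge written out serially on the host, with hypothetical types and std::map standing in for the per-warp hash map:

#include <map>
#include <vector>

// RAP(row) = RAP_int(row) + sum over neighbors n of RAP_ext[n](flag[n][row]),
// where flag[n][row] == -1 means neighbor n contributed nothing for this row.
struct CsrFragment
{
    std::vector<int>    row_offsets;
    std::vector<int>    col_indices;
    std::vector<double> values;
};

static void add_row( std::map<int, double> &acc, const CsrFragment &f, int local_row )
{
    for ( int it = f.row_offsets[local_row] ; it < f.row_offsets[local_row + 1] ; ++it )
    {
        acc[f.col_indices[it]] += f.values[it];
    }
}

std::map<int, double> merge_rap_row( int owned_row,
                                     const CsrFragment &rap_int,
                                     const std::vector<CsrFragment> &rap_ext,
                                     const std::vector<std::vector<int>> &flag )
{
    std::map<int, double> acc;            // plays the role of the per-warp hash map
    add_row( acc, rap_int, owned_row );   // local contribution first

    for ( size_t n = 0 ; n < rap_ext.size() ; ++n )
    {
        if ( flag[n][owned_row] != -1 )   // neighbor n sent this row
        {
            add_row( acc, rap_ext[n], flag[n][owned_row] );
        }
    }

    return acc;                           // keys = merged sparsity, values = summed entries
}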
- int count = rap_col_end - rap_col_it; - - if ( count == 0 ) - { - continue; - } - - map.store( count, &RAP_cols[rap_col_it], &RAP_vals[rap_col_it] ); - } -} - - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace csr_multiply_sm70 - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace amgx -{ - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -enum { WARP_SIZE = 32, SMEM_SIZE = 128 }; - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -CSR_Multiply_Sm70 >::CSR_Multiply_Sm70( bool allocate_values, int grid_size, int max_warp_count, int gmem_size ) : - Base(allocate_values, grid_size, max_warp_count, gmem_size) -{} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::count_non_zeroes( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ) -{ - const int GRID_SIZE = 128; - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - - // Compute non-zero elements. - switch ( this->m_num_threads_per_row_count ) - { - case 2: - csr_multiply_sm70::count_non_zeroes_kernel< 2, CTA_SIZE, SMEM_SIZE, WARP_SIZE, true> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - break; - - case 4: - csr_multiply_sm70::count_non_zeroes_kernel< 4, CTA_SIZE, SMEM_SIZE, WARP_SIZE, true> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - break; - - case 8: - csr_multiply_sm70::count_non_zeroes_kernel< 8, CTA_SIZE, SMEM_SIZE, WARP_SIZE, true> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - break; - - case 16: - csr_multiply_sm70::count_non_zeroes_kernel<16, CTA_SIZE, SMEM_SIZE, WARP_SIZE, true> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? 
Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - break; - - default: - csr_multiply_sm70::count_non_zeroes_kernel <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - } - - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::count_non_zeroes_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids) - -{ - const int GRID_SIZE = 128; - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - // This is num_owned_coarse_rows - int RAP_size = RAP.get_num_rows(); - int RAP_int_size = RAP_int.row_offsets.size() - 1; - - if (RAP_int_size < RAP_size) - { - FatalError("RAP_int has less rows than RAP, need to modify sparse RAP add to handle that case\n", AMGX_ERR_NOT_IMPLEMENTED); - } - - // Create a device vector of the raw pointers to the array - // RAP_ext_row_ids_ptrs - // RAP_ext_row_offsets_ptrs - // RAP_ext_col_indices_ptrs - // RAP_ext_values_ptrs - int num_neighbors = RAP_ext_row_offsets.size(); - std::vector flagArray(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - flagArray[i].resize(RAP_size); - thrust::fill(flagArray[i].begin(), flagArray[i].end(), -1); - } - - cudaCheckError(); - std::vector flagArray_ptrs_h(num_neighbors); - std::vector RAP_ext_row_offsets_ptrs_h(num_neighbors); - std::vector RAP_ext_col_indices_ptrs_h(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - flagArray_ptrs_h[i] = thrust::raw_pointer_cast(&flagArray[i][0]); - RAP_ext_row_offsets_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_row_offsets[i][0]); - RAP_ext_col_indices_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_col_indices[i][0]); - } - - device_vector_alloc flagArray_ptrs = flagArray_ptrs_h; - device_vector_alloc RAP_ext_row_offsets_ptrs = RAP_ext_row_offsets_ptrs_h; - device_vector_alloc RAP_ext_col_indices_ptrs = RAP_ext_col_indices_ptrs_h; - - for (int i = 0; i < num_neighbors; i++) - { - int size = RAP_ext_row_ids[i].size(); - - if (size != 0) - { - int num_blocks = min(4096, (size + 127) / 128); - //write the position in RAP_ext_row_ids - csr_multiply_sm70::flag_halo_rows <<< num_blocks, 128>>>( - RAP_ext_row_ids[i].raw(), - size, - flagArray[i].raw(), - i, - RAP.manager->global_id()); - } - } - - csr_multiply_sm70::count_non_zeroes_RAP_ext_kernel <<< GRID_SIZE, CTA_SIZE>>>( - RAP_size, - RAP_int.row_offsets.raw(), - RAP_int.col_indices.raw(), - thrust::raw_pointer_cast(&RAP_ext_row_offsets_ptrs[0]), - thrust::raw_pointer_cast(&RAP_ext_col_indices_ptrs[0]), - RAP.row_offsets.raw(), - (int *) NULL, - thrust::raw_pointer_cast(&flagArray_ptrs[0]), - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status, - num_neighbors, - 
RAP.manager->global_id() ); - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - -// ==================================================================================================================== - -template< int CTA_SIZE, bool COUNT_ONLY, typename Diag_traits, typename Matrix > -static void -count_non_zeroes_ilu1_dispatch( const Matrix &A, Matrix &B, int num_threads_per_row_count, int gmem_size, int *keys, int *work_queue, int *status ) -{ - const int GRID_SIZE = 128; - - switch ( num_threads_per_row_count ) - { - case 2: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel< 2, CTA_SIZE, SMEM_SIZE, WARP_SIZE, COUNT_ONLY, Diag_traits> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - break; - - case 4: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel< 4, CTA_SIZE, SMEM_SIZE, WARP_SIZE, COUNT_ONLY, Diag_traits> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - break; - - case 8: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel< 8, CTA_SIZE, SMEM_SIZE, WARP_SIZE, COUNT_ONLY, Diag_traits> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - break; - - case 16: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel<16, CTA_SIZE, SMEM_SIZE, WARP_SIZE, COUNT_ONLY, Diag_traits> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - break; - - default: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - } - - cudaCheckError(); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::count_non_zeroes_ilu1( const Matrix_d &A, Matrix_d &B ) -{ - const int GRID_SIZE = 128; - - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - - // Count the number of non zeroes. - if ( A.hasProps(DIAG) ) - count_non_zeroes_ilu1_dispatch( - A, - B, - this->m_num_threads_per_row_count, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - else - count_non_zeroes_ilu1_dispatch( - A, - B, - this->m_num_threads_per_row_count, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - - // Compute non-zero elements. 
- CUDA_SAFE_CALL( cudaGetLastError() ); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_offsets( Matrix_d &C ) -{ - thrust::device_ptr offsets_begin(C.row_offsets.raw()); - thrust::device_ptr offsets_end (C.row_offsets.raw() + C.get_num_rows() + 1); - thrust::exclusive_scan( offsets_begin, offsets_end, offsets_begin ); - cudaCheckError(); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_sparsity( const Matrix_d &A, const Matrix_d &B, Matrix_d &C ) -{ - const int GRID_SIZE = 128; - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // std::cerr << "CSR_Multiply_Sm70 >::compute_sparsity" << std::endl; - // Reset the work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - - // Compute the values. - switch ( this->m_num_threads_per_row_count ) - { - case 2: - csr_multiply_sm70::count_non_zeroes_kernel< 2, CTA_SIZE, SMEM_SIZE, WARP_SIZE, false> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - break; - - case 4: - csr_multiply_sm70::count_non_zeroes_kernel< 4, CTA_SIZE, SMEM_SIZE, WARP_SIZE, false> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - break; - - case 8: - csr_multiply_sm70::count_non_zeroes_kernel< 8, CTA_SIZE, SMEM_SIZE, WARP_SIZE, false> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - break; - - case 16: - csr_multiply_sm70::count_non_zeroes_kernel<16, CTA_SIZE, SMEM_SIZE, WARP_SIZE, false> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - break; - - default: - csr_multiply_sm70::count_non_zeroes_kernel <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - } - - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_sparsity_ilu1( const Matrix_d &A, Matrix_d &B ) -{ - const 
int GRID_SIZE = 128; - - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - - // Count the number of non zeroes. - if ( A.hasProps(DIAG) ) - count_non_zeroes_ilu1_dispatch( - A, - B, - this->m_num_threads_per_row_count, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - else - count_non_zeroes_ilu1_dispatch( - A, - B, - this->m_num_threads_per_row_count, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - - // Make sure it worked properly. - CUDA_SAFE_CALL( cudaGetLastError() ); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_values( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, int num_threads, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ) -{ - const int GRID_SIZE = 256; - const int CTA_SIZE = 128; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset the work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - // Compute the values. - int *status = NULL; - - if ( num_threads != this->m_num_threads_per_row_compute ) - { - status = this->m_status; - } - - switch ( num_threads ) - { - case 2: - csr_multiply_sm70::compute_values_kernel< 2, Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - break; - - case 4: - csr_multiply_sm70::compute_values_kernel< 4, Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - break; - - case 8: - csr_multiply_sm70::compute_values_kernel< 8, Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? 
Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - break; - - case 16: - csr_multiply_sm70::compute_values_kernel<16, Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - break; - - default: - csr_multiply_sm70::compute_values_kernel <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - } - - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_values_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids, int num_threads) -{ - const int GRID_SIZE = 128; - const int CTA_SIZE = 128; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset the work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - // Compute the values. 
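All of these kernels are launched with a fixed grid and pull extra rows from a single global counter: the host primes it with GRID_SIZE * NUM_WARPS (each resident warp takes its first row from blockIdx and warp_id), and a warp that finishes its row calls get_work, where lane 0 performs an atomicAdd and a shuffle broadcasts the result to the other lanes. A minimal sketch of that persistent-warp work queue, with hypothetical names and CUDA 9+ shuffles:

#include <cstdio>

#define WARP_SIZE 32
#define FULL_MASK 0xffffffffu

// Lane 0 grabs the next row index from the global counter; the whole warp
// receives it through a shuffle (sketch of the get_work idiom).
__device__ __forceinline__ int get_next_row( int *queue )
{
    int next = -1;

    if ( threadIdx.x % WARP_SIZE == 0 )
    {
        next = atomicAdd( queue, 1 );
    }

    return __shfl_sync( FULL_MASK, next, 0 );
}

__global__ void persistent_warp_loop( int num_rows, int *queue, int *touched )
{
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int num_warps = blockDim.x / WARP_SIZE;

    // Each warp starts on its "own" row, then keeps pulling work from the queue.
    for ( int row = blockIdx.x * num_warps + warp_id ; row < num_rows ; row = get_next_row( queue ) )
    {
        if ( threadIdx.x % WARP_SIZE == 0 )
        {
            atomicAdd( &touched[row], 1 );   // stand-in for real per-row work
        }
    }
}

int main()
{
    const int grid = 4, cta = 128, num_warps_total = grid * ( cta / WARP_SIZE );
    const int num_rows = 1000;
    int *queue, *touched;
    cudaMalloc( &queue, sizeof(int) );
    cudaMalloc( &touched, num_rows * sizeof(int) );
    cudaMemset( touched, 0, num_rows * sizeof(int) );
    // The queue starts just past the rows the warps pick up implicitly.
    cudaMemcpy( queue, &num_warps_total, sizeof(int), cudaMemcpyHostToDevice );
    persistent_warp_loop<<<grid, cta>>>( num_rows, queue, touched );
    cudaDeviceSynchronize();
    printf( "done\n" );
    cudaFree( queue );
    cudaFree( touched );
    return 0;
}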
- int *status = NULL; - - if ( num_threads != this->m_num_threads_per_row_compute ) - { - status = this->m_status; - } - - // This is num_owned_coarse_rows - int RAP_size = RAP.get_num_rows(); - int RAP_int_size = RAP_int.row_offsets.size() - 1; - - if (RAP_int_size < RAP_size) - { - FatalError("RAP_int has less rows than RAP, need to modify sparse RAP add to handle that case\n", AMGX_ERR_NOT_IMPLEMENTED); - } - - //TODO: Optimize: reuse arrays from count nonzeros - int num_neighbors = RAP_ext_row_offsets.size(); - std::vector flagArray(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - flagArray[i].resize(RAP_size); - thrust::fill(flagArray[i].begin(), flagArray[i].end(), -1); - } - - cudaCheckError(); - std::vector flagArray_ptrs_h(num_neighbors); - std::vector RAP_ext_row_offsets_ptrs_h(num_neighbors); - std::vector RAP_ext_col_indices_ptrs_h(num_neighbors); - std::vector RAP_ext_values_ptrs_h(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - flagArray_ptrs_h[i] = thrust::raw_pointer_cast(&flagArray[i][0]); - RAP_ext_row_offsets_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_row_offsets[i][0]); - RAP_ext_col_indices_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_col_indices[i][0]); - RAP_ext_values_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_values[i][0]); - } - - device_vector_alloc flagArray_ptrs = flagArray_ptrs_h; - device_vector_alloc RAP_ext_row_offsets_ptrs = RAP_ext_row_offsets_ptrs_h; - device_vector_alloc RAP_ext_col_indices_ptrs = RAP_ext_col_indices_ptrs_h; - device_vector_alloc RAP_ext_values_ptrs = RAP_ext_values_ptrs_h; - - for (int i = 0; i < num_neighbors; i++) - { - int size = RAP_ext_row_ids[i].size(); - - if (size != 0) - { - int num_blocks = min(4096, (size + 127) / 128); - //write the position in RAP_ext_row_ids - csr_multiply_sm70::flag_halo_rows <<< num_blocks, 128>>>( - RAP_ext_row_ids[i].raw(), - size, - flagArray[i].raw(), - i, - RAP.manager->global_id()); - } - } - - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); - csr_multiply_sm70::compute_values_RAP_ext_kernel< Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - RAP_size, - RAP_int.row_offsets.raw(), - RAP_int.col_indices.raw(), - RAP_int.values.raw(), - thrust::raw_pointer_cast(&RAP_ext_row_offsets_ptrs[0]), - thrust::raw_pointer_cast(&RAP_ext_col_indices_ptrs[0]), - thrust::raw_pointer_cast(&RAP_ext_values_ptrs[0]), - RAP.row_offsets.raw(), - RAP.col_indices.raw(), - RAP.values.raw(), - thrust::raw_pointer_cast(&flagArray_ptrs[0]), - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - num_neighbors, - status ); - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#define AMGX_CASE_LINE(CASE) template class CSR_Multiply_Sm70::Type>; -AMGX_FORALL_BUILDS(AMGX_CASE_LINE) -AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) -#undef AMGX_CASE_LINE - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace amgx - diff --git a/base/src/multiply.cu b/base/src/multiply.cu index 4de2781b..333d756d 100644 --- a/base/src/multiply.cu +++ b/base/src/multiply.cu @@ -395,10 +395,10 @@ void blockDiaCsrMultiplyKernel(const IndexType *row_offsets, template< typename IndexType, typename ValueTypeA, typename ValueTypeB, int CTA_SIZE, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if 
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, const IndexType *column_indices, @@ -418,6 +418,11 @@ void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, const int laneId_div_16 = laneId / 16; const int upperHalf = 16 * laneId_div_16; const int upperMask = 0xffff << upperHalf; +#if __CUDA_ARCH__ < 300 + const int nWarps = CTA_SIZE / 32; // Number of half warps per CTA. + const int warpId = threadIdx.x / 32; + volatile __shared__ IndexType s_aColIds[nWarps][32]; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile ValueTypeB s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -465,13 +470,20 @@ void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, aColId = column_indices[aColIt]; } +#if __CUDA_ARCH__ < 300 + s_aColIds[warpId][laneId] = aColId; +#endif // Loop over columns. We compute 8 columns per iteration. for ( int k = 0, nCols = __popc( utils::ballot(aColId != -1) & upperMask ) ; k < nCols ; k += 4 ) { int my_k = k + halfLaneId_div_4; // Exchange column indices. +#if __CUDA_ARCH__ >= 300 int waColId = utils::shfl( aColId, upperHalf + my_k ); +#else + int waColId = s_aColIds[warpId][upperHalf + my_k]; +#endif // Load 8 blocks of X if needed. ValueTypeB my_x = types::util::get_zero(); @@ -513,6 +525,8 @@ void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, } // Loop over aColIt // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_Ax = my_Ax + utils::shfl_xor( my_Ax, 1 ); @@ -524,6 +538,40 @@ void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, my_Ax = my_Ax + utils::shfl_xor( my_Ax, 8 ); } +#else + types::util::volcast(my_Ax, s_mem + threadIdx.x); + { + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) + { + my_Ax = my_Ax + types::util::volcast(s_mem[threadIdx.x + 1]); + types::util::volcast(my_Ax, s_mem + threadIdx.x); + } + + if ( laneId < 30 ) + { + my_Ax = my_Ax + types::util::volcast(s_mem[threadIdx.x + 2]); + types::util::volcast(my_Ax, s_mem + threadIdx.x); + } + } + else + { + if ( laneId < 31 ) + { + my_Ax = my_Ax + types::util::volcast(s_mem[threadIdx.x + 4]); + types::util::volcast(my_Ax, s_mem + threadIdx.x); + } + + if ( laneId < 30 ) + { + my_Ax = my_Ax + types::util::volcast(s_mem[threadIdx.x + 8]); + types::util::volcast(my_Ax, s_mem + threadIdx.x); + } + } + } +#endif + // Store the results. 
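The hunk above moves the launch-bounds guard back to __CUDA_ARCH__ >= 350 and restores the pre-Kepler reduction path: newer devices combine my_Ax across neighbouring lanes with shfl_xor, while the fallback stages partial sums in volatile shared memory and relies on legacy warp-synchronous execution. A minimal sketch of the two styles, with hypothetical names; the shared-memory variant would need __syncwarp between steps on Volta and later:

#define WARP_SIZE 32
#define FULL_MASK 0xffffffffu

// Butterfly reduction with shuffles: after log2(32) steps every lane holds
// the warp-wide sum (Kepler and newer).
__device__ float warp_sum_shfl( float v )
{
    for ( int mask = 1 ; mask < WARP_SIZE ; mask <<= 1 )
    {
        v += __shfl_xor_sync( FULL_MASK, v, mask );
    }

    return v;
}

// Shared-memory fallback: each lane publishes its value, then folds in its
// neighbour's slot, the way the restored pre-SM30 path does. The volatile
// qualifier keeps the compiler from caching the staging buffer in registers.
__device__ float warp_sum_smem( float v, volatile float *s_buf, int lane_id )
{
    s_buf[threadIdx.x] = v;

    for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 )
    {
        if ( lane_id < offset )
        {
            v += s_buf[threadIdx.x + offset];
            s_buf[threadIdx.x] = v;
        }
    }

    return s_buf[threadIdx.x - lane_id];   // lane 0's slot holds the total
}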
if ( ROW_MAJOR ) { diff --git a/core/include/matrix_coloring/bfs.h b/core/include/matrix_coloring/bfs.h index 7fa21715..82b88dd0 100644 --- a/core/include/matrix_coloring/bfs.h +++ b/core/include/matrix_coloring/bfs.h @@ -42,6 +42,7 @@ template __device__ __forceinline__ T warp const int warpId = utils::warp_id(); const int laneId = utils::lane_id(); T value = input; +#if __CUDA_ARCH__ >= 300 #pragma unroll for (int i = 1; i < WARP_SIZE; i *= 2) @@ -54,6 +55,22 @@ template __device__ __forceinline__ T warp } } +#else + volatile __shared__ T s_shfl[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i < WARP_SIZE; i *= 2) + { + s_shfl[threadIdx.x] = value; + T n = s_shfl[threadIdx.x - i]; + + if (laneId >= i) + { + value += n; + } + } + +#endif return value; } @@ -147,7 +164,11 @@ __global__ void bfs_expand( for (int i = 0; i < n_neighbors; ++i) { +#if __CUDA_ARCH__ >= 350 int col = __ldg(col_indices + row_begin + i); +#else + int col = col_indices[row_begin + i]; +#endif task_queue_out[block_tail_ + n_neighbors_exclusive_scan + i] = col; } @@ -165,7 +186,15 @@ struct filter_visited_closure __device__ inline int operator()(const int &index) { +#if __CUDA_ARCH__ >= 350 + if (__ldg(distances_ptr + index) >= 0) { return 1; } + +#else + + if (distances_ptr[index] >= 0) { return 1; } + +#endif return 0; } }; diff --git a/core/include/matrix_coloring/coloring_utils.h b/core/include/matrix_coloring/coloring_utils.h index 5a1997c8..99dfb278 100644 --- a/core/include/matrix_coloring/coloring_utils.h +++ b/core/include/matrix_coloring/coloring_utils.h @@ -232,7 +232,9 @@ struct used_color_structure_64_bit template __device__ __forceinline__ void sync_subwarp(const int sublane_id) { +#if __CUDA_ARCH__ >= 300 #pragma unroll + for (int i = WARP_SIZE / 2; i >= 1; i /= 2) { int tmp_hi = __double2hiint( __longlong_as_double( bitfield ) ); @@ -242,6 +244,20 @@ struct used_color_structure_64_bit long long tmp = __double_as_longlong(__hiloint2double(tmp_hi, tmp_lo)); bitfield |= tmp; } + +#else + __shared__ volatile long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_used_colors[threadIdx.x] = bitfield; + long long tmp = s_used_colors[threadIdx.x + i]; + + if (sublane_id + i < WARP_SIZE) { bitfield |= tmp; } + } + +#endif } __device__ __forceinline__ void aggregate(const used_color_structure_64_bit &b) { @@ -364,7 +380,9 @@ unsigned long long int box_id : __device__ __forceinline__ void sync_subwarp(const int sublane_id) { used_color_structure_64_bit_colorbox tmps; +#if __CUDA_ARCH__ >= 300 #pragma unroll + for (int i = WARP_SIZE / 2; i >= 1; i /= 2) { int tmp_hi = __double2hiint( __longlong_as_double( data.bitfield ) ); @@ -375,6 +393,21 @@ unsigned long long int box_id : tmps.data.bitfield = tmp; //get neighbors bitfield aggregate(tmps); } + +#else + __shared__ volatile unsigned long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_used_colors[threadIdx.x] = data.bitfield; + unsigned long long tmp = s_used_colors[threadIdx.x + i]; + tmps.data.bitfield = tmp; //get neighbors bitfield + + if (sublane_id + i < WARP_SIZE) { aggregate(tmps); } + } + +#endif } }; diff --git a/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu b/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu index e0c6c52a..70787114 100644 --- a/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu +++ 
b/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu @@ -38,7 +38,6 @@ #include #include #include -#include #include @@ -51,12 +50,13 @@ namespace aggregation #include -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 -#include // Included inside the namespace to solve name collisions. +#include // Included inside the namespace to solve name collisions. static __device__ __forceinline__ int get_work( int *queue, int warp_id, int count = 1 ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -65,22 +65,23 @@ static __device__ __forceinline__ int get_work( int *queue, int warp_id, int cou } return utils::shfl( offset, 0 ); +#else + return 0; +#endif } #else -#include // Included inside the namespace to solve name collisions. +#include // Included inside the namespace to solve name collisions. -static __device__ __forceinline__ int get_work( int *queue, int warp_id, int count = 1 ) +static __device__ __forceinline__ int get_work( volatile int *offsets, int *queue, int warp_id, int count = 1 ) { - int offset = -1; - if ( utils::lane_id() == 0 ) { - offset = atomicAdd( queue, count ); + offsets[warp_id] = atomicAdd( queue, count ); } - return utils::shfl( offset, 0 ); + return offsets[warp_id]; } #endif @@ -108,6 +109,12 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ < 300 + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile int s_bcast_cols[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -117,9 +124,18 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. // First threads load the row IDs of A needed by the CTA... int r_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 + Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#else Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of R. +#if __CUDA_ARCH__ >= 300 + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // Make sure we have to proceed. if ( COUNT_ONLY ) @@ -151,6 +167,9 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. a_row_id = R_cols[r_col_it]; } +#if __CUDA_ARCH__ < 300 + s_bcast_cols[threadIdx.x] = a_row_id; +#endif const int num_rows = __popc( utils::ballot(is_active) ); // Uniform loop: threads collaborate to load other elements. @@ -160,7 +179,17 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. // Is it an active thread. bool is_active_k = local_k < num_rows; // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). +#if __CUDA_ARCH__ >= 300 const int uniform_a_row_id = utils::shfl( a_row_id, local_k ); +#else + int uniform_a_row_id = -1; + + if ( is_active_k ) + { + uniform_a_row_id = s_bcast_cols[warp_id * WARP_SIZE + local_k]; + } + +#endif // Load the range of the row of B. 
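The bfs.h hunk above restores a shared-memory fallback for warp_scan; both variants are the same Hillis-Steele inclusive prefix sum over one warp, and an exclusive offset can then be obtained by subtracting the lane's own input. A minimal sketch of the shuffle-based variant with current sync intrinsics, hypothetical name, full 32-lane warp assumed:

#define WARP_SIZE 32
#define FULL_MASK 0xffffffffu

// Hillis-Steele inclusive scan across one warp: after step i, each lane
// holds the sum of its own value and the i lanes below it.
__device__ int warp_inclusive_scan( int value )
{
    const int lane_id = threadIdx.x % WARP_SIZE;

#pragma unroll
    for ( int i = 1 ; i < WARP_SIZE ; i *= 2 )
    {
        int n = __shfl_up_sync( FULL_MASK, value, i );

        if ( lane_id >= i )
        {
            value += n;
        }
    }

    return value;   // lane 31 ends up with the warp-wide total
}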
int a_col_it = 0, a_col_end = 0; @@ -213,10 +242,10 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. template< typename Value_type, int NUM_THREADS_PER_ROW, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool HAS_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 8 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 8 ) +__launch_bounds__( CTA_SIZE, 6 ) #endif void fill_A_kernel_1x1( const int R_num_rows, const int *R_rows, @@ -240,8 +269,19 @@ void fill_A_kernel_1x1( const int R_num_rows, const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 // The hash values stored in shared memory. __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; +#else + // Shared memory to vote. + __shared__ volatile int s_bcast_row[CTA_SIZE]; + // The hash keys stored in shared memory. + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to reduce the diagonal. + __shared__ volatile Value_type s_diag[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -251,12 +291,24 @@ void fill_A_kernel_1x1( const int R_num_rows, // First threads load the row IDs of A needed by the CTA... int r_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 Hash_map map( &s_keys[warp_id * SMEM_SIZE ], &g_keys[r_row_id * gmem_size ], &s_vote[warp_id * SMEM_SIZE / 4], &g_vals[r_row_id * gmem_size ], gmem_size ); +#else + Hash_map map( &s_keys[warp_id * SMEM_SIZE ], + &g_keys[r_row_id * gmem_size], + &s_vals[warp_id * SMEM_SIZE ], + &g_vals[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of A. +#if __CUDA_ARCH__ >= 300 + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // The indices of the output row. int ac_col_it = Ac_rows[r_row_id + 0]; @@ -283,6 +335,10 @@ void fill_A_kernel_1x1( const int R_num_rows, a_row_id = R_cols[r_col_it]; } +#if __CUDA_ARCH__ < 300 + s_bcast_row[threadIdx.x] = a_row_id; +#endif + // Update the diagonal (if needed). if ( HAS_DIAG && is_active ) { @@ -296,7 +352,17 @@ void fill_A_kernel_1x1( const int R_num_rows, { int local_k = k + lane_id_div_num_threads; // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). +#if __CUDA_ARCH__ >= 300 const int uniform_a_row_id = utils::shfl( a_row_id, local_k ); +#else + int uniform_a_row_id = -1; + + if ( local_k < num_rows ) + { + uniform_a_row_id = s_bcast_row[warp_id * WARP_SIZE + local_k]; + } + +#endif // The range of the row of B. int a_col_it = 0, a_col_end = 0; @@ -342,7 +408,16 @@ void fill_A_kernel_1x1( const int R_num_rows, // Update the diagonal. 
if ( HAS_DIAG ) { +#if __CUDA_ARCH__ >= 300 r_diag = utils::warp_reduce<1, utils::Add>( r_diag ); +#else + types::util::volcast(r_diag, s_diag + threadIdx.x); +#ifdef _MSC_VER + r_diag = utils::warp_reduce_sum<1, Value_type>(s_diag, r_diag); +#else + r_diag = utils::warp_reduce<1, utils::Add>(s_diag, r_diag); +#endif +#endif if ( lane_id == 0 ) { @@ -365,10 +440,10 @@ void fill_A_kernel_1x1( const int R_num_rows, /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template< typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool HAS_DIAG > -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -__global__ __launch_bounds__( CTA_SIZE, 8 ) -#else +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __global__ __launch_bounds__( CTA_SIZE, 8 ) +#else +__global__ __launch_bounds__( CTA_SIZE ) #endif void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. const int *R_rows, @@ -391,21 +466,46 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ < 300 + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile int s_bcast_cols[CTA_SIZE]; + // The table to share aggregates. + __shared__ volatile int s_bcast_ac_col[CTA_SIZE]; + // Shared memory to broadcast indices. + __shared__ volatile int s_bcast_ac_idx[CTA_SIZE]; + // Shared memory to broadcast values. + __shared__ volatile Value_type s_bcast_ac_val[CTA_SIZE / 2]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); // Constants. const int lane_id_div_16 = lane_id / 16; const int lane_id_mod_16 = lane_id % 16; +#if __CUDA_ARCH__ >= 300 const int warp_offset = 16 * lane_id_div_16; +#else + const int cta_offset = 16 * (threadIdx.x / 16); +#endif // First threads load the row IDs of A needed by the CTA... int r_row_id = blockIdx.x * NUM_WARPS + warp_id; // My index. Hash_index index( &g_idx[r_row_id * gmem_size] ); // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#else + Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of R. +#if __CUDA_ARCH__ >= 300 + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // The indices of the row. int ac_col_it = Ac_rows[r_row_id + 0]; @@ -470,12 +570,21 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. } int ac_idx = ac_col_it + set.find_index( key, index, false ); +#if __CUDA_ARCH__ < 300 + s_bcast_ac_col[threadIdx.x] = ac_col_id; + s_bcast_ac_idx[threadIdx.x] = ac_idx; +#endif // Iterate over the 16 items. for ( int k = 0 ; k < 16 ; ++k ) { +#if __CUDA_ARCH__ >= 300 int uniform_ac_col = utils::shfl( ac_col_id, warp_offset + k ); int uniform_ac_idx = utils::shfl( ac_idx, warp_offset + k ); +#else + int uniform_ac_col = s_bcast_ac_col[cta_offset + k]; + int uniform_ac_idx = s_bcast_ac_idx[cta_offset + k]; +#endif // Early loop exit. 
if ( utils::all( uniform_ac_col == -1 ) ) @@ -493,6 +602,8 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. a_value = A_vals[16 * uniform_a_col_it + lane_id_mod_16]; } +#if __CUDA_ARCH__ >= 300 + // Proceed diagonal if needed. if ( HAS_DIAG && uniform_ac_col == r_row_id ) { @@ -519,13 +630,45 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. { Ac_vals[16 * uniform_ac_idx + lane_id_mod_16] = Ac_vals[16 * uniform_ac_idx + lane_id_mod_16] + a_value; } + +#else + + // Update the diagonal if it is a diagonal term. + if ( HAS_DIAG && uniform_ac_col == r_row_id ) + { + ac_diag = ac_diag + a_value; + uniform_ac_col = -1; + } + + // Update the value. + if ( uniform_ac_col != -1 ) + { + utils::atomic_add( &Ac_vals[16 * uniform_ac_idx + lane_id_mod_16], a_value ); + } + +#endif } } } if ( HAS_DIAG ) { +#if __CUDA_ARCH__ >= 300 ac_diag = ac_diag + utils::shfl_xor( ac_diag, 16 ); +#else + + if ( lane_id_div_16 == 1 ) + { + types::util::volcast(ac_diag, s_bcast_ac_val + 16 * warp_id + lane_id_mod_16); + } + +// + s_bcast_ac_val[16*warp_id + lane_id_mod_16] = ac_diag) + if ( lane_id_div_16 == 0 ) + { + ac_diag = ac_diag + types::util::volcast(s_bcast_ac_val[16 * warp_id + lane_id_mod_16]); + } + +#endif if ( lane_id_div_16 == 0 ) { @@ -538,10 +681,10 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template< typename Value_type, int N, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool HAS_DIAG, bool FORCE_DETERMINISM > -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __global__ __launch_bounds__( CTA_SIZE, 8 ) #else -__global__ __launch_bounds__( CTA_SIZE, 8 ) +__global__ __launch_bounds__( CTA_SIZE ) #endif void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. const int *R_rows, @@ -569,22 +712,46 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. const int NUM_ITEMS_PER_WARP = T_WARP == 0 ? 1 : T_WARP; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ < 300 + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile int s_bcast_cols[CTA_SIZE]; + // The table to share aggregates. + __shared__ volatile int s_bcast_ac_col[CTA_SIZE]; + // Shared memory to broadcast indices. + __shared__ volatile int s_bcast_ac_idx[CTA_SIZE]; + // Shared memory to broadcast values. + __shared__ volatile Value_type s_bcast_ac_val[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); // Constants. const int lane_id_div_NxN = lane_id / NxN; const int lane_id_mod_NxN = lane_id % NxN; +#if __CUDA_ARCH__ >= 300 const int warp_offset = NxN * lane_id_div_NxN; +#else + const int cta_offset = warp_id * WARP_SIZE + NxN * lane_id_div_NxN; +#endif // First threads load the row IDs of A needed by the CTA... int r_row_id = blockIdx.x * NUM_WARPS + warp_id; // My index. Hash_index index( &g_idx[r_row_id * gmem_size] ); // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 + Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#else Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of R. 
+#if __CUDA_ARCH__ >= 300 for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // The indices of the row. int ac_col_it = Ac_rows[r_row_id + 0]; @@ -651,10 +818,15 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. } int ac_idx = ac_col_it + set.find_index( key, index, false ); +#if __CUDA_ARCH__ < 300 + s_bcast_ac_col[threadIdx.x] = ac_col_id; + s_bcast_ac_idx[threadIdx.x] = ac_idx; +#endif // Iterate over the NxN items. for ( int k = 0 ; k < NxN ; ++k ) { +#if __CUDA_ARCH__ >= 300 int uniform_ac_col = utils::shfl( ac_col_id, warp_offset + k ); int uniform_ac_idx = utils::shfl( ac_idx, warp_offset + k ); @@ -664,6 +836,17 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. uniform_ac_idx = -1; } +#else + int uniform_ac_col = -1, uniform_ac_idx = -1; + + if ( lane_id_div_NxN < NUM_ITEMS_PER_WARP ) + { + uniform_ac_col = s_bcast_ac_col[cta_offset + k]; + uniform_ac_idx = s_bcast_ac_idx[cta_offset + k]; + } + +#endif + // Early loop exit. if ( utils::all( uniform_ac_col == -1 ) ) { @@ -700,7 +883,16 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. { if ( !FORCE_DETERMINISM ) { +#if __CUDA_ARCH__ >= 300 ac_diag = utils::warp_reduce( ac_diag ); +#else + types::util::volcast(ac_diag, s_bcast_ac_val + threadIdx.x); +#ifdef _MSC_VER + ac_diag = utils::warp_reduce_sum(s_bcast_ac_val, ac_diag); +#else + ac_diag = utils::warp_reduce( s_bcast_ac_val, ac_diag ); +#endif +#endif } if ( lane_id_div_NxN == 0 ) @@ -713,10 +905,10 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. // when blocksize is larger than warp size template< typename Value_type, int N, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool HAS_DIAG, bool FORCE_DETERMINISM, int NUM_BLOCK_ITERS_PER_WARP> -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __global__ __launch_bounds__( CTA_SIZE, 8 ) #else -__global__ __launch_bounds__( CTA_SIZE, 8 ) +__global__ __launch_bounds__( CTA_SIZE ) #endif void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. const int *R_rows, @@ -743,6 +935,18 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. const int NUM_ITEMS_PER_WARP = 1; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ < 300 + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile int s_bcast_cols[CTA_SIZE]; + // The table to share aggregates. + __shared__ volatile int s_bcast_ac_col[CTA_SIZE]; + // Shared memory to broadcast indices. + __shared__ volatile int s_bcast_ac_idx[CTA_SIZE]; + // Shared memory to broadcast values. + __shared__ volatile Value_type s_bcast_ac_val[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -751,9 +955,18 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. // My index. Hash_index index( &g_idx[r_row_id * gmem_size] ); // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 + Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#else Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of R. 
+#if __CUDA_ARCH__ >= 300 + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // The indices of the row. int ac_col_it = Ac_rows[r_row_id + 0]; @@ -808,12 +1021,21 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. } int ac_idx = ac_col_it + set.find_index( key, index, false ); +#if __CUDA_ARCH__ < 300 + s_bcast_ac_col[threadIdx.x] = ac_col_id; + s_bcast_ac_idx[threadIdx.x] = ac_idx; +#endif // Iterate over the NxN items. for ( int k = 0 ; k < NxN ; ++k ) { +#if __CUDA_ARCH__ >= 300 int uniform_ac_col = utils::shfl( ac_col_id, k ); int uniform_ac_idx = utils::shfl( ac_idx, k ); +#else + int uniform_ac_col = s_bcast_ac_col[k]; + int uniform_ac_idx = s_bcast_ac_idx[k]; +#endif // Early loop exit. if ( utils::all( uniform_ac_col == -1 ) ) @@ -857,7 +1079,16 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. { if ( !FORCE_DETERMINISM ) { +#if __CUDA_ARCH__ >= 300 ac_diag = utils::warp_reduce( ac_diag ); +#else + types::util::volcast(ac_diag, s_bcast_ac_val + threadIdx.x); +#ifdef _MSC_VER + ac_diag = utils::warp_reduce_sum(s_bcast_ac_val, ac_diag); +#else + ac_diag = utils::warp_reduce( s_bcast_ac_val, ac_diag ); +#endif +#endif } Ac_vals[NxN * Ac_diag[r_row_id] + lane_id] = ac_diag; diff --git a/core/src/aggregation/selectors/parallel_greedy_selector.cu b/core/src/aggregation/selectors/parallel_greedy_selector.cu index d54e5dac..4a52cca3 100755 --- a/core/src/aggregation/selectors/parallel_greedy_selector.cu +++ b/core/src/aggregation/selectors/parallel_greedy_selector.cu @@ -189,6 +189,7 @@ void compute_ring_leader( const int A_num_rows, int *__restrict out_leader_id, int *__restrict out_leader_hash ) { +#if __CUDA_ARCH__ >= 300 const int NUM_WARPS_PER_CTA = CTA_SIZE / WARP_SIZE; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); @@ -289,6 +290,8 @@ void compute_ring_leader( const int A_num_rows, out_leader_hash[a_row_id] = my_max_hash; } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -308,6 +311,7 @@ void build_aggregates( const int num_rings, int *__restrict aggregates, int *__restrict num_unaggregated) { +#if __CUDA_ARCH__ >= 300 const int NUM_WARPS_PER_CTA = CTA_SIZE / WARP_SIZE; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); @@ -447,6 +451,8 @@ void build_aggregates( const int num_rings, curr_row = max_id; } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/core/src/classical/interpolators/distance2.cu b/core/src/classical/interpolators/distance2.cu index 25e60cf7..1825fb31 100644 --- a/core/src/classical/interpolators/distance2.cu +++ b/core/src/classical/interpolators/distance2.cu @@ -598,34 +598,35 @@ void Distance2_Interpolator -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. 
-__device__ __forceinline__ int get_work( int *queue, int warp_id ) +__device__ __forceinline__ int get_work( volatile int *offsets, int *queue, int warp_id ) { - int offset = -1; - if ( utils::lane_id() == 0 ) { - offset = atomicAdd( queue, 1 ); + offsets[warp_id] = atomicAdd( queue, 1 ); } - return utils::shfl( offset, 0 ); + return offsets[warp_id]; } -} // namespace distance2_sm35 +} // namespace distance2_sm20 + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -namespace distance2_sm70 +namespace distance2_sm35 { #include -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. __device__ __forceinline__ int get_work( int *queue, int warp_id ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -634,9 +635,12 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) } return utils::shfl( offset, 0 ); +#else + return 0; +#endif } -} // namespace distance2_sm70 +} // namespace distance2_sm35 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -909,22 +913,30 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 distance2_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + distance2_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip coarse rows. @@ -939,6 +951,7 @@ compute_c_hat_kernel( int A_num_rows, set.clear(); // Load the range of the row. __syncthreads(); +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -948,6 +961,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif __syncthreads(); // _iterate over the columns of A to build C_hat. @@ -1069,7 +1092,13 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. 
__shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; - +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -1079,16 +1108,17 @@ compute_c_hat_kernel( int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 distance2_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + distance2_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip coarse rows. @@ -1101,8 +1131,8 @@ compute_c_hat_kernel( int A_num_rows, // Clear the set. set.clear(); - // Load the range of the row. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -1112,6 +1142,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // _iterate over the columns of A to build C_hat. for ( int a_col_it = a_col_begin + lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) @@ -1250,22 +1290,32 @@ compute_inner_sum_kernel( const int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. __shared__ volatile Value_type s_a_values[CTA_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; + // The sign of the diagonal. + __shared__ volatile bool s_sign_diag[NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 distance2_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + distance2_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. 
-#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip coarse rows. @@ -1286,6 +1336,7 @@ compute_inner_sum_kernel( const int A_num_rows, int inner_sum_offset = inner_sum_offsets[a_row_id]; // And share the value of the diagonal. bool sign_diag = false; +#if __CUDA_ARCH__ >= 300 if ( lane_id == 0 ) { @@ -1293,8 +1344,17 @@ compute_inner_sum_kernel( const int A_num_rows, } sign_diag = utils::shfl( sign_diag, 0 ); +#else + + if ( lane_id == 0 ) + { + s_sign_diag[warp_id] = sign( diag[a_row_id] ); + } + sign_diag = s_sign_diag[warp_id]; +#endif // Load A row IDs. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -1304,6 +1364,16 @@ compute_inner_sum_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // Iterate over the columns of A. for ( a_col_it += lane_id ; utils::any( a_col_it < a_col_end ) ; a_col_it += WARP_SIZE ) @@ -1382,7 +1452,9 @@ compute_inner_sum_kernel( const int A_num_rows, } // Reduce the row to a single value. +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) { int hi = __double2hiint(bottom_sum); @@ -1393,6 +1465,18 @@ compute_inner_sum_kernel( const int A_num_rows, } bottom_sum = utils::shfl( bottom_sum, 0 ); +#else + s_a_values[threadIdx.x] = bottom_sum; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_a_values[threadIdx.x] = bottom_sum += s_a_values[threadIdx.x + offset]; + } + + bottom_sum = s_a_values[warp_id * WARP_SIZE]; +#endif if ( lane_id == k && bottom_sum != Value_type(0) ) { @@ -1441,6 +1525,15 @@ compute_inner_sum_kernel( const int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. __shared__ volatile Value_type s_a_values[CTA_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; + // The sign of the diagonal. + __shared__ volatile bool s_sign_diag[NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -1450,16 +1543,17 @@ compute_inner_sum_kernel( const int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 distance2_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + distance2_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. 
-#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip coarse rows. @@ -1480,6 +1574,7 @@ compute_inner_sum_kernel( const int A_num_rows, int inner_sum_offset = inner_sum_offsets[a_row_id]; // And share the value of the diagonal. bool sign_diag = false; +#if __CUDA_ARCH__ >= 300 if ( lane_id == 0 ) { @@ -1496,6 +1591,23 @@ compute_inner_sum_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id == 0 ) + { + s_sign_diag[warp_id] = sign( diag[a_row_id] ); + } + + sign_diag = s_sign_diag[warp_id]; + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // Iterate over the columns of A. for ( a_col_it += lane_id ; utils::any( a_col_it < a_col_end ) ; a_col_it += WARP_SIZE ) @@ -1588,7 +1700,9 @@ compute_inner_sum_kernel( const int A_num_rows, } // Reduce the row to a single value. +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) { int hi = __double2hiint(bottom_sum); @@ -1599,6 +1713,18 @@ compute_inner_sum_kernel( const int A_num_rows, } bottom_sum = utils::shfl( bottom_sum, lane_id_mod_num_threads * NUM_THREADS_PER_ROW ); +#else + s_a_values[threadIdx.x] = bottom_sum; +#pragma unroll + + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) + if ( (lane_id & (NUM_THREADS_PER_ROW - 1)) < offset ) + { + s_a_values[threadIdx.x] = bottom_sum += s_a_values[threadIdx.x + offset]; + } + + bottom_sum = s_a_values[warp_id * WARP_SIZE + lane_id_mod_num_threads * NUM_THREADS_PER_ROW]; +#endif if ( lane_id >= k && lane_id < k + NUM_THREADS_PER_ROW && bottom_sum != Value_type(0) ) { @@ -1650,11 +1776,18 @@ compute_interp_weight_kernel( const int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. __shared__ volatile Value_type s_aki[NUM_WARPS]; +#if __CUDA_ARCH__ >= 300 // The hash values stored in shared memory. -#if __CUDA_ARCH__ >= 700 - __shared__ volatile distance2_sm70::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; -#else __shared__ volatile distance2_sm35::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; + // Shared memory to store the values in the hash table. + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; + // Shared memory for broadcast. + __shared__ volatile Value_type s_a_values[CTA_SIZE]; #endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); @@ -1662,24 +1795,25 @@ compute_interp_weight_kernel( const int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. 
-#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_map map( &s_keys[warp_id * SMEM_SIZE], +#if __CUDA_ARCH__ >= 300 + distance2_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], &s_vote[warp_id * SMEM_SIZE / 4], &g_vals[a_row_id * gmem_size], gmem_size ); #else - distance2_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], + distance2_sm20::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { int coarse_fine_id = cf_map[a_row_id]; @@ -1709,6 +1843,7 @@ compute_interp_weight_kernel( const int A_num_rows, int c_hat_end = C_hat_end [a_row_id]; map.load( c_hat_end - c_hat_it, &C_hat[c_hat_it], &C_hat_pos[c_hat_it] ); // Load A row IDs. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -1718,7 +1853,16 @@ compute_interp_weight_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // The offset in the inner sum table. int inner_sum_offset = inner_sum_offsets[a_row_id]; // Weak value. @@ -1767,6 +1911,7 @@ compute_interp_weight_kernel( const int A_num_rows, int num_rows = __popc( vote ); // We pre-load inner sums. +#if __CUDA_ARCH__ >= 300 sum = Value_type(0); if ( lane_id < num_rows ) @@ -1774,6 +1919,14 @@ compute_interp_weight_kernel( const int A_num_rows, sum = inner_sum[inner_sum_offset + lane_id]; } +#else + + if ( lane_id < num_rows ) + { + s_a_values[threadIdx.x] = inner_sum[inner_sum_offset + lane_id]; + } + +#endif inner_sum_offset += num_rows; // For each warp, we have up to 32 rows of B to proceed. @@ -1794,7 +1947,11 @@ compute_interp_weight_kernel( const int A_num_rows, } // Load the kth inner sum. +#if __CUDA_ARCH__ >= 300 Value_type uniform_val = utils::shfl( sum, k ); +#else + Value_type uniform_val = s_a_values[warp_id * WARP_SIZE + k]; +#endif // _iterate over the range of columns of B. for ( int b_col_it = b_col_begin + lane_id ; utils::any( b_col_it < b_col_end ) ; b_col_it += WARP_SIZE ) @@ -1834,6 +1991,7 @@ compute_interp_weight_kernel( const int A_num_rows, } // We're done with that row of A. We compute D. 
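// The reduction patched below collapses each lane's partial sum into a single
// value per warp.  A minimal sketch of both variants, assuming a converged
// 32-lane warp; warp_sum and warp_sum_smem are hypothetical names standing in
// for the utils::shfl_xor and shared-memory paths of the patched code:
__device__ __forceinline__ double warp_sum( double x )
{
    // Butterfly reduction: after five xor steps every lane holds the warp total.
    for ( int mask = 16 ; mask > 0 ; mask >>= 1 )
    {
        x += __shfl_xor_sync( 0xffffffffu, x, mask );
    }

    return x;
}

__device__ __forceinline__ double warp_sum_smem( volatile double *s_mem, double x, int lane_id )   // hypothetical pre-SM30 variant
{
    // Pre-SM30 fallback: tree reduction through shared memory; lane 0's slot
    // ends up holding the warp total (warp-synchronous assumption).
    s_mem[threadIdx.x] = x;

    for ( int offset = 16 ; offset > 0 ; offset >>= 1 )
        if ( lane_id < offset )
        {
            s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset];
        }

    return s_mem[threadIdx.x - lane_id];
}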
+#if __CUDA_ARCH__ >= 300 #pragma unroll for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) @@ -1848,7 +2006,27 @@ compute_interp_weight_kernel( const int A_num_rows, } sum = utils::shfl( sum, 0 ); +#else + s_a_values[threadIdx.x] = weak; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_a_values[threadIdx.x] = weak += s_a_values[threadIdx.x + offset]; + } + + weak = s_a_values[warp_id * WARP_SIZE]; + + if ( lane_id == 0 ) + { + weak += diag[a_row_id]; + s_a_values[threadIdx.x] = Value_type(-1) / weak; + } + sum = s_a_values[warp_id * WARP_SIZE]; +#endif +#if __CUDA_ARCH__ >= 300 int p_col_tmp = -1; if ( lane_id < 2 ) @@ -1858,7 +2036,16 @@ compute_interp_weight_kernel( const int A_num_rows, int p_col_it = utils::shfl( p_col_tmp, 0 ); int p_col_end = utils::shfl( p_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = P_rows[a_row_id + lane_id]; + } + + int p_col_it = s_rows[2 * warp_id + 0]; + int p_col_end = s_rows[2 * warp_id + 1]; +#endif map.store_map_keys_scale_values( p_col_end - p_col_it, cf_map, &P_cols[p_col_it], sum, &P_vals[p_col_it] ); } } diff --git a/core/src/classical/interpolators/multipass.cu b/core/src/classical/interpolators/multipass.cu index ce2fd6e3..1df77ba5 100644 --- a/core/src/classical/interpolators/multipass.cu +++ b/core/src/classical/interpolators/multipass.cu @@ -270,34 +270,35 @@ void Multipass_Interpolator -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. -__device__ __forceinline__ int get_work( int *queue, int warp_id ) +__device__ __forceinline__ int get_work( volatile int *offsets, int *queue, int warp_id ) { - int offset = -1; - if ( utils::lane_id() == 0 ) { - offset = atomicAdd( queue, 1 ); + offsets[warp_id] = atomicAdd( queue, 1 ); } - return utils::shfl( offset, 0 ); + return offsets[warp_id]; } -} // namespace multipass_sm35 +} // namespace multipass_sm20 -namespace multipass_sm70 +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace multipass_sm35 { #include -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. __device__ __forceinline__ int get_work( int *queue, int warp_id ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -306,9 +307,12 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) } return utils::shfl( offset, 0 ); +#else + return 0; +#endif } -} // namespace multipass_sm70 +} // namespace multipass_sm35 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -330,6 +334,9 @@ estimate_c_hat_size_kernel( const int A_num_rows, int *C_hat_offsets, int pass ) { +#if __CUDA_ARCH__ < 300 + __shared__ volatile int s_mem[CTA_SIZE]; +#endif const int NUM_WARPS_PER_CTA = CTA_SIZE / WARP_SIZE; // Number of items per grid. 
const int NUM_WARPS_PER_GRID = gridDim.x * NUM_WARPS_PER_CTA; @@ -378,6 +385,7 @@ estimate_c_hat_size_kernel( const int A_num_rows, } // Do reduction +#if __CUDA_ARCH__ >= 300 #pragma unroll for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) @@ -385,6 +393,18 @@ estimate_c_hat_size_kernel( const int A_num_rows, my_count += utils::shfl_xor( my_count, mask ); } +#else + s_mem[threadIdx.x] = my_count; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_mem[threadIdx.x] = my_count += s_mem[threadIdx.x + offset]; + } + +#endif + // Write result -- RACE CONDITION! if (lane_id == 0) { @@ -408,6 +428,11 @@ compute_c_hat_first_pass_kernel( int A_num_rows, { const int NUM_WARPS_PER_CTA = CTA_SIZE / WARP_SIZE; const int NUM_WARPS_PER_GRID = gridDim.x * NUM_WARPS_PER_CTA; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS_PER_CTA]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -424,6 +449,7 @@ compute_c_hat_first_pass_kernel( int A_num_rows, } // Load the range of the row. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -433,7 +459,16 @@ compute_c_hat_first_pass_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif int count = 0; int c_col_it = C_hat_start[a_row_id]; @@ -506,6 +541,13 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. __shared__ volatile KeyType s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -515,16 +557,17 @@ compute_c_hat_kernel( int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - multipass_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 multipass_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + multipass_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip if not in current pass @@ -536,6 +579,7 @@ compute_c_hat_kernel( int A_num_rows, // Clear the set. set.clear(); // Load the range of the row. 
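// The load patched just below, and several later hunks, use the same idiom to
// fetch a CSR row's [begin, end) range: two lanes read the offsets, the warp
// broadcasts them.  A minimal sketch, assuming a converged warp; load_row_range
// is a hypothetical helper, not part of the patch:
__device__ __forceinline__ void load_row_range( const int *A_rows, int a_row_id, int &a_col_begin, int &a_col_end )
{
    const int lane_id = threadIdx.x & 31;
    int tmp = -1;

    if ( lane_id < 2 )
    {
        tmp = A_rows[a_row_id + lane_id];   // lane 0 -> row begin, lane 1 -> row end
    }

    a_col_begin = __shfl_sync( 0xffffffffu, tmp, 0 );
    a_col_end   = __shfl_sync( 0xffffffffu, tmp, 1 );
}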
+#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -545,6 +589,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // _iterate over the columns of A to build C_hat. for ( int a_col_it = a_col_begin + lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) @@ -658,7 +712,14 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, { const int NUM_WARPS_PER_CTA = CTA_SIZE / 32; const int NUM_WARPS_PER_GRID = gridDim.x * NUM_WARPS_PER_CTA; - +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS_PER_CTA]; + // Shared memory for broadcast. + __shared__ volatile Value_type s_n_values[CTA_SIZE]; + __shared__ volatile Value_type s_c_values[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -690,6 +751,7 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, else if (assigned[a_row_id] == 1) { // Load A row IDs. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -699,7 +761,17 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif +#if __CUDA_ARCH__ >= 300 int p_col_tmp = -1; if ( lane_id < 2 ) @@ -709,7 +781,16 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, int p_col_it = utils::shfl( p_col_tmp, 0 ); int p_col_end = utils::shfl( p_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = P_rows[a_row_id + lane_id]; + } + + int p_col_it = s_rows[2 * warp_id + 0]; + int p_col_end = s_rows[2 * warp_id + 1]; +#endif // Weak value. Value_type sum_N(0), sum_C(0), alfa(0); int count = 0; @@ -763,7 +844,9 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, } // We're done with that row of A, now reduce sum_N and sum_C +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) { sum_C += utils::shfl_xor( sum_C, mask ); @@ -777,6 +860,33 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, } alfa = utils::shfl( alfa, 0 ); +#else + s_c_values[threadIdx.x] = sum_C; + s_n_values[threadIdx.x] = sum_N; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_c_values[threadIdx.x] = sum_C += s_c_values[threadIdx.x + offset]; + s_n_values[threadIdx.x] = sum_N += s_n_values[threadIdx.x + offset]; + } + + sum_C = s_c_values[warp_id * WARP_SIZE]; + sum_N = s_n_values[warp_id * WARP_SIZE]; + + if ( lane_id == 0 ) + { + if (fabs(sum_C * diag[a_row_id]) < 1e-10) { printf("Dividing by zero\n"); } + + double div = (fabs(sum_C * diag[a_row_id]) < 1e-10) ? 1. 
: sum_C * diag[a_row_id]; + alfa = -sum_N / div; + // alfa = -sum_N/(sum_C*diag[a_row_id]); + s_c_values[threadIdx.x] = alfa; + } + + alfa = s_c_values[warp_id * WARP_SIZE]; +#endif // Scale the value of P for ( p_col_it += lane_id ; utils::any( p_col_it < p_col_end ) ; p_col_it += WARP_SIZE ) @@ -791,6 +901,8 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, } // Loop over rows } + + template< typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, typename KeyType > __global__ __launch_bounds__( CTA_SIZE ) void @@ -815,6 +927,7 @@ compute_interp_weight_kernel( const int A_num_rows, const int *__restrict assigned, int pass) { +#if 1 const int NUM_WARPS = CTA_SIZE / 32; // The hash keys stored in shared memory. __shared__ volatile KeyType s_keys[NUM_WARPS * SMEM_SIZE]; @@ -822,11 +935,18 @@ compute_interp_weight_kernel( const int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads store a value of B to load. __shared__ volatile Value_type s_b_values[CTA_SIZE]; +#if __CUDA_ARCH__ >= 300 // The hash values stored in shared memory. -#if __CUDA_ARCH__ >= 700 - __shared__ volatile multipass_sm70::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; -#else __shared__ volatile multipass_sm35::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; + // Shared memory to store the values in the hash table. + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; + __shared__ volatile Value_type s_n_values[CTA_SIZE]; + __shared__ volatile Value_type s_c_values[CTA_SIZE]; #endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); @@ -834,24 +954,25 @@ compute_interp_weight_kernel( const int A_num_rows, // First threads load the row IDs of A needed by the CTA... volatile int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - multipass_sm70::Hash_map map( &s_keys[warp_id * SMEM_SIZE], +#if __CUDA_ARCH__ >= 300 + multipass_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], &s_vote[warp_id * SMEM_SIZE / 4], &g_vals[a_row_id * gmem_size], gmem_size ); #else - multipass_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], + multipass_sm20::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Only do work if assigned[row_id] == pass @@ -867,6 +988,7 @@ compute_interp_weight_kernel( const int A_num_rows, int c_hat_end = c_hat_it + C_hat_size[a_row_id]; map.load( c_hat_end - c_hat_it, &C_hat[c_hat_it], &C_hat_pos[c_hat_it] ); // Load A row IDs. 
+#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -876,7 +998,16 @@ compute_interp_weight_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // sums Value_type sum_N(0), sum_C(0), alfa(0); @@ -957,7 +1088,9 @@ compute_interp_weight_kernel( const int A_num_rows, } // We're done with that row of A, now reduce sum_N and sum_C +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) { sum_C += utils::shfl_xor( sum_C, mask ); @@ -972,7 +1105,31 @@ compute_interp_weight_kernel( const int A_num_rows, } alfa = utils::shfl( alfa, 0 ); +#else + s_c_values[threadIdx.x] = sum_C; + s_n_values[threadIdx.x] = sum_N; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_c_values[threadIdx.x] = sum_C += s_c_values[threadIdx.x + offset]; + s_n_values[threadIdx.x] = sum_N += s_n_values[threadIdx.x + offset]; + } + + sum_C = s_c_values[warp_id * WARP_SIZE]; + sum_N = s_n_values[warp_id * WARP_SIZE]; + + if ( lane_id == 0 ) + { + double div = (fabs(sum_C * diag[a_row_id]) < 1e-10) ? 1. : sum_C * diag[a_row_id]; + alfa = -sum_N / div; + s_c_values[threadIdx.x] = alfa; + } + alfa = s_c_values[warp_id * WARP_SIZE]; +#endif +#if __CUDA_ARCH__ >= 300 int p_col_tmp = -1; if ( lane_id < 2 ) @@ -982,9 +1139,20 @@ compute_interp_weight_kernel( const int A_num_rows, int p_col_it = utils::shfl( p_col_tmp, 0 ); int p_col_end = utils::shfl( p_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = P_rows[a_row_id + lane_id]; + } + + int p_col_it = s_rows[2 * warp_id + 0]; + int p_col_end = s_rows[2 * warp_id + 1]; +#endif map.store_keys_scale_values( p_col_end - p_col_it, &P_cols[p_col_it], alfa, &P_vals[p_col_it] ); } + +#endif } } // namespace multipass diff --git a/core/src/classical/strength/affinity.cu b/core/src/classical/strength/affinity.cu index 8daf8eeb..bd48af03 100644 --- a/core/src/classical/strength/affinity.cu +++ b/core/src/classical/strength/affinity.cu @@ -137,9 +137,14 @@ void initRandom(Vector &vec, int size) template static __device__ __inline__ T cahedRead (const T *ar) { +#if __CUDA_ARCH__ >= 350 return utils::Ld::load(ar); +#else + return utils::Ld::load(ar); +#endif } +#if __CUDA_ARCH__ >= 300 // ! shfl is defined for int/float only in arch 600 template< typename ValueType > __forceinline__ __device__ void reduce_inwarp_mul( @@ -160,6 +165,33 @@ __forceinline__ __device__ void reduce_inwarp_mul( *s_xy = sum; } +#else +template< typename ValueType > +__forceinline__ __device__ void reduce_inwarp_mul( + const ValueType vValueA, + const ValueType vValueB, + const int vecId, + const int n_per_warp, + volatile ValueType *smem, + double *s_xy) +{ + ValueType sum = vValueA * vValueB; + smem[threadIdx.x] = sum; +#pragma unroll + + for ( int offset = 16 / n_per_warp ; offset > 0 ; offset /= 2 ) + if ( vecId < offset ) + { + smem[threadIdx.x] = sum = sum + smem[threadIdx.x + offset]; + } + + if ( vecId == 0 ) + { + // If laneId=0, then sum is in smem[threadIdx.x]. 
+ *s_xy = sum; + } +} +#endif template< typename ValueType > __forceinline__ __device__ void reduce_inblock_mul( @@ -240,7 +272,11 @@ void computeAffinity_1x1_Kernel(const int *A_rows, const int num_vecs_per_warp = 32 / nTV; const int num_rows_per_iter = gridDim.x * blockDim.x / nTV; //const int num_vecs_per_block = blockDim.x / nTV; +#if __CUDA_ARCH__ >= 300 ValueTypeB *smem = NULL; +#else + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif double s_xx, s_xy, s_yy; for ( int aRowId = tid / nTV ; aRowId < A_num_rows ; diff --git a/core/src/matrix_coloring/coloring_utils.cu b/core/src/matrix_coloring/coloring_utils.cu index d6f09819..6c0b3a28 100644 --- a/core/src/matrix_coloring/coloring_utils.cu +++ b/core/src/matrix_coloring/coloring_utils.cu @@ -87,7 +87,11 @@ __global__ void permute_colors_kernel(int num_rows, int *row_colors, int *color_ for ( ; row_id < num_rows ; row_id += blockDim.x * gridDim.x ) { int color = row_colors[row_id]; +#if __CUDA_ARCH__ >= 350 color = __ldg(color_permutation + color); +#else + color = color_permutation[color]; +#endif row_colors[row_id] = color; } } diff --git a/core/src/matrix_coloring/greedy_min_max_2ring.cu b/core/src/matrix_coloring/greedy_min_max_2ring.cu index 5df9d46f..286f3137 100644 --- a/core/src/matrix_coloring/greedy_min_max_2ring.cu +++ b/core/src/matrix_coloring/greedy_min_max_2ring.cu @@ -208,6 +208,7 @@ void find_max_neighbor_kernel_and_propagate_used_colors( const int A_num_rows, } //BEGIN: Reduce used_colors/max_hashes amongst subwarps +#if __CUDA_ARCH__ >= 300 #pragma unroll for (int i = WARP_SIZE / 2; i >= 1; i /= 2) @@ -230,6 +231,36 @@ void find_max_neighbor_kernel_and_propagate_used_colors( const int A_num_rows, used_colors |= tmpu; } +#else + __shared__ volatile int s_max_hash[CTA_SIZE + WARP_SIZE / 2]; + __shared__ volatile int s_max_hash_id[CTA_SIZE + WARP_SIZE / 2]; + __shared__ volatile long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_max_hash[threadIdx.x] = max_hash; + s_max_hash_id[threadIdx.x] = max_hash_id; + int tmp = s_max_hash[threadIdx.x + i]; + int tmp_id = s_max_hash_id[threadIdx.x + i]; + + if (lane_id + i < WARP_SIZE) + { + if (tmp_id >= 0 && (max_hash_id < 0 || tmp > max_hash || (tmp == max_hash && tmp_id >= max_hash_id))) + { + max_hash = tmp; + max_hash_id = tmp_id; + } + } + + /// + s_used_colors[threadIdx.x] = used_colors; + long long tmpu = s_used_colors[threadIdx.x + i]; + + if (lane_id + i < WARP_SIZE) { used_colors |= tmpu; } + } + +#endif //END: Reduce used_colors/max_hashes amongst subwarps //The subwarp leader stores the result. @@ -324,6 +355,7 @@ void color_kernel_greedy_onlymax( } //reduce used colors bit by bit. +#if __CUDA_ARCH__ >= 300 #pragma unroll for (int i = WARP_SIZE / 2; i >= 1; i /= 2) @@ -336,6 +368,19 @@ void color_kernel_greedy_onlymax( used_colors |= tmp; } +#else + __shared__ volatile long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_used_colors[threadIdx.x] = used_colors; + long long tmp = s_used_colors[threadIdx.x + i]; + + if (lane_id + i < WARP_SIZE) { used_colors |= tmp; } + } + +#endif int my_color_1 = 64 - utils::bfind( ~used_colors ); if (__popc(used_colors) >= 64 || my_color_1 > 64 || my_color_1 <= 0) @@ -579,6 +624,7 @@ void color_kernel_greedy_gtlt( //is_max_vertex = row_gt_count==0; //is_min_vertex = false; //reduce used colors bit by bit. 
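// The hunk below OR-reduces a 64-bit colour mask across the warp; the patched
// code splits the value into hi/lo 32-bit words (the __double2hiint trick seen
// further down) because older shuffles were 32-bit only.  A compact sketch of
// the same reduction with the native 64-bit shuffle, assuming a converged warp;
// warp_or64 is a hypothetical name:
__device__ __forceinline__ long long warp_or64( long long used_colors )
{
    for ( int i = 16 ; i >= 1 ; i /= 2 )
    {
        // __shfl_xor_sync accepts 64-bit operands directly on current toolkits.
        used_colors |= __shfl_xor_sync( 0xffffffffu, used_colors, i );
    }

    return used_colors;
}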
+#if __CUDA_ARCH__ >= 300 #pragma unroll for (int i = WARP_SIZE / 2; i >= 1; i /= 2) @@ -591,6 +637,19 @@ void color_kernel_greedy_gtlt( used_colors |= tmp; } +#else + __shared__ volatile long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_used_colors[threadIdx.x] = used_colors; + long long tmp = s_used_colors[threadIdx.x + i]; + + if (lane_id + i < WARP_SIZE) { used_colors |= tmp; } + } + +#endif int my_color_1 = 0; int my_color_2 = 0; int free_colors = __popc(used_colors); diff --git a/core/src/matrix_coloring/min_max.cu b/core/src/matrix_coloring/min_max.cu index 23df42cf..960e720f 100644 --- a/core/src/matrix_coloring/min_max.cu +++ b/core/src/matrix_coloring/min_max.cu @@ -199,6 +199,10 @@ find_min_max_neighbors_kernel( const int *__restrict A_offsets, const int NUM_ROWS_PER_CTA = CTA_SIZE / NUM_THREADS_PER_ROW; const int warp_id = threadIdx.x / NUM_THREADS_PER_ROW; const int lane_id = threadIdx.x % NUM_THREADS_PER_ROW; +#if __CUDA_ARCH__ < 300 + __shared__ volatile int s_min_hash[CTA_SIZE]; + __shared__ volatile int s_max_hash[CTA_SIZE]; +#endif for ( int row_id = blockIdx.x * NUM_ROWS_PER_CTA + warp_id ; utils::any(row_id < num_rows) ; row_id += gridDim.x * NUM_ROWS_PER_CTA ) { @@ -255,13 +259,29 @@ find_min_max_neighbors_kernel( const int *__restrict A_offsets, } } +#if __CUDA_ARCH__ < 300 + s_min_hash[threadIdx.x] = min_hash; + s_max_hash[threadIdx.x] = max_hash; +#pragma unroll + + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_min_hash[threadIdx.x] = min_hash = min( min_hash, s_min_hash[threadIdx.x + offset] ); + s_max_hash[threadIdx.x] = max_hash = max( max_hash, s_max_hash[threadIdx.x + offset] ); + } + +#else #pragma unroll + for ( int mask = NUM_THREADS_PER_ROW / 2 ; mask > 0 ; mask >>= 1 ) { min_hash = min( min_hash, utils::shfl_xor( min_hash, mask ) ); max_hash = max( max_hash, utils::shfl_xor( max_hash, mask ) ); } +#endif + if ( row_id < num_rows && lane_id == 0 ) { max_hash_array[row_id] = max_hash; diff --git a/core/src/matrix_coloring/min_max_2ring.cu b/core/src/matrix_coloring/min_max_2ring.cu index ee56cc0f..6f2776e6 100644 --- a/core/src/matrix_coloring/min_max_2ring.cu +++ b/core/src/matrix_coloring/min_max_2ring.cu @@ -654,7 +654,9 @@ void color_kernel_greedy( const int A_num_rows, const int *A_rows, const int *A_ is_min_vertex = false; //reduce used colors bit by bit. 
+#if __CUDA_ARCH__ >= 350 #pragma unroll + for (int i = WARP_SIZE / 2; i >= 1; i /= 2) { int tmp_hi = __double2hiint( __longlong_as_double( used_colors ) ); @@ -665,6 +667,7 @@ void color_kernel_greedy( const int A_num_rows, const int *A_rows, const int *A_ used_colors |= tmp; } +#endif int my_color = 64 - utils::bfind( ~used_colors ); if (my_color <= 0) { my_color = 1; } diff --git a/core/src/scalers/nbinormalization.cu b/core/src/scalers/nbinormalization.cu index 4af53de6..2eba79a7 100644 --- a/core/src/scalers/nbinormalization.cu +++ b/core/src/scalers/nbinormalization.cu @@ -169,10 +169,16 @@ void computeBetaGammaDevice(IndexType rows, IndexType *offsets, IndexType *indic const int vectors_per_block = VectorsPerCTA; const int vector_id = threadIdx.x / VectorSize; const int lane_id = threadIdx.x % VectorSize; +#if __CUDA_ARCH__ >= 300 +#else + __shared__ IndexType row_offsets[2 * VectorsPerCTA]; + __shared__ volatile VectorValue bi_sum[CTASize + VectorSize / 2]; +#endif for (int i = vectors_per_block * blockIdx.x + vector_id; i < rows; i += vectors_per_block * gridDim.x) { // load start + end pointers +#if __CUDA_ARCH__ >= 300 int row_tmp; if (lane_id < 2) @@ -183,6 +189,16 @@ void computeBetaGammaDevice(IndexType rows, IndexType *offsets, IndexType *indic // distribute to all other threads in warp int row_begin = utils::shfl(row_tmp, vector_id * VectorSize, warpSize, utils::activemask()); int row_end = utils::shfl(row_tmp, vector_id * VectorSize + 1, warpSize, utils::activemask()); +#else + + if (lane_id < 2) + { + row_offsets[2 * vector_id + lane_id] = offsets[i + lane_id]; + } + + int row_begin = row_offsets[2 * vector_id]; + int row_end = row_offsets[2 * vector_id + 1]; +#endif VectorValue bi(0.); for (int jj = row_begin + lane_id; utils::any(jj < row_end, utils::activemask()); jj += VectorSize) @@ -200,7 +216,12 @@ void computeBetaGammaDevice(IndexType rows, IndexType *offsets, IndexType *indic } // reduce over bi +#if __CUDA_ARCH__ >= 300 VectorValue bi_s = warpReduceSum(bi); +#else + bi_sum[threadIdx.x] = bi; + VectorValue bi_s = warpReduceSumShared(&bi_sum[vector_id * VectorSize], lane_id); +#endif if (lane_id == 0) { @@ -218,10 +239,15 @@ void computeGammaDevice(int rows, IndexType *offsets, IndexType *indices, Matrix const int vectors_per_block = CTASize / VectorSize; const int vector_id = threadIdx.x / VectorSize; const int lane_id = threadIdx.x % VectorSize; +#if __CUDA_ARCH__ >= 300 +#else + __shared__ int row_offsets[2 * (CTASize / VectorSize)]; +#endif for (int i = vectors_per_block * blockIdx.x + vector_id; i < rows; i += vectors_per_block * gridDim.x) { // load start + end pointers +#if __CUDA_ARCH__ >= 300 int row_tmp; if (lane_id < 2) @@ -232,6 +258,16 @@ void computeGammaDevice(int rows, IndexType *offsets, IndexType *indices, Matrix // distribute to all other threads in warp int row_begin = utils::shfl(row_tmp, vector_id * VectorSize, warpSize, utils::activemask()); int row_end = utils::shfl(row_tmp, vector_id * VectorSize + 1, warpSize, utils::activemask()); +#else + + if (lane_id < 2) + { + row_offsets[2 * vector_id + lane_id] = offsets[i + lane_id]; + } + + int row_begin = row_offsets[2 * vector_id]; + int row_end = row_offsets[2 * vector_id + 1]; +#endif for (int jj = row_begin + lane_id; utils::any(jj < row_end, utils::activemask()); jj += VectorSize) { @@ -257,10 +293,16 @@ void computeBetaDevice(int rows, IndexType *offsets, IndexType *indices, MatrixV const int vectors_per_block = CTASize / VectorSize; const int vector_id = threadIdx.x / VectorSize; const 
int lane_id = threadIdx.x % VectorSize; +#if __CUDA_ARCH__ >= 300 +#else + __shared__ int row_offsets[2 * (CTASize / VectorSize)]; + __shared__ volatile VectorValue bi_sum[CTASize + VectorSize / 2]; +#endif for (int i = vectors_per_block * blockIdx.x + vector_id; i < rows; i += vectors_per_block * gridDim.x) { // load start + end pointers +#if __CUDA_ARCH__ >= 300 int row_tmp; if (lane_id < 2) @@ -271,6 +313,16 @@ void computeBetaDevice(int rows, IndexType *offsets, IndexType *indices, MatrixV // distribute to all other threads in warp int row_begin = utils::shfl(row_tmp, vector_id * VectorSize, warpSize, utils::activemask()); int row_end = utils::shfl(row_tmp, vector_id * VectorSize + 1, warpSize, utils::activemask()); +#else + + if (lane_id < 2) + { + row_offsets[2 * vector_id + lane_id] = offsets[i + lane_id]; + } + + int row_begin = row_offsets[2 * vector_id]; + int row_end = row_offsets[2 * vector_id + 1]; +#endif VectorValue bi = 0.; for (int jj = row_begin + lane_id; utils::any(jj < row_end, utils::activemask()); jj += VectorSize) @@ -287,7 +339,12 @@ void computeBetaDevice(int rows, IndexType *offsets, IndexType *indices, MatrixV } // reduce over bi +#if __CUDA_ARCH__ >= 300 VectorValue bi_s = warpReduceSum(bi); +#else + bi_sum[threadIdx.x] = bi; + VectorValue bi_s = warpReduceSumShared(&bi_sum[vector_id * VectorSize], lane_id); +#endif if (lane_id == 0) { diff --git a/core/src/solvers/dense_lu_solver.cu b/core/src/solvers/dense_lu_solver.cu index fefbd822..2fc6d8b1 100644 --- a/core/src/solvers/dense_lu_solver.cu +++ b/core/src/solvers/dense_lu_solver.cu @@ -136,6 +136,7 @@ void csr_to_dense_kernel( } +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 template< int N, bool ROW_MAJOR, int WARP_SIZE, typename Value_type > static __device__ __forceinline__ Value_type reduce_distributed_vectors( Value_type x, int is_leader, unsigned int active_mask ) @@ -166,16 +167,53 @@ Value_type reduce_distributed_vectors( Value_type x, int is_leader, unsigned int return x; } +#else +template< int N, bool ROW_MAJOR, int WARP_SIZE, typename Value_type > +static __device__ __forceinline__ +Value_type reduce_distributed_vectors( volatile Value_type *s_mem, Value_type x, int is_leader ) +{ + if ( N & (N - 1) ) + { +#pragma unroll + + for ( int i = 1 ; i < N ; ++i ) + { + const int offset = ROW_MAJOR ? i : N * i; + + if ( is_leader && utils::lane_id() < WARP_SIZE - offset ) + { + s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset]; + } + } + } + else + { +#pragma unroll + + for ( int i = 1 ; i < N ; i <<= 1 ) + { + const int offset = ROW_MAJOR ? i : N * i; + + if ( utils::lane_id() < WARP_SIZE - offset ) + { + s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset]; + } + } + } + + return x; +} +#endif template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void b_minus_A_halo_x( const int *__restrict A_rows, const int *__restrict A_cols, @@ -204,8 +242,17 @@ void b_minus_A_halo_x( const int *__restrict A_rows, // Useful index to compute matrix products. const int lane_id_mod_NxN_div_N = lane_id_mod_NxN / N; const int lane_id_mod_NxN_mod_N = lane_id_mod_NxN % N; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. 
+ __shared__ volatile int s_a_col_ids[CTA_SIZE]; + __shared__ volatile int s_a_col_is_valid[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[threadIdx.x - lane_id_mod_NxN]; + volatile int *my_s_a_col_is_valid = &s_a_col_is_valid[threadIdx.x - lane_id_mod_NxN]; +#else // We to get my data from when I use SHFL. const int shfl_offset = lane_id - lane_id_mod_NxN; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -273,6 +320,10 @@ void b_minus_A_halo_x( const int *__restrict A_rows, // Determine if the column is halo column int a_col_is_valid = (a_col_id != -1) && (a_col_id >= num_owned_rows); +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[lane_id_mod_NxN] = a_col_id; + my_s_a_col_is_valid[lane_id_mod_NxN] = a_col_is_valid; +#endif // Count the number of active columns. // int vote = __ballot(aColId != -1); // The number of iterations. @@ -283,8 +334,13 @@ void b_minus_A_halo_x( const int *__restrict A_rows, { int my_k = k + lane_id_mod_NxN_div_N; // Load N blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; + int uniform_a_col_is_valid = my_s_a_col_is_valid[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k, WARP_SIZE, active_mask); int uniform_a_col_is_valid = utils::shfl( a_col_is_valid, shfl_offset + my_k, WARP_SIZE, active_mask ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 && uniform_a_col_is_valid) @@ -334,7 +390,12 @@ void b_minus_A_halo_x( const int *__restrict A_rows, is_leader = lane_id_mod_NxN_mod_N == 0; } +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors( my_bmAx, is_leader, active_mask ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors( s_mem, my_bmAx, is_leader ); +#endif // Update the shared terms. if ( ROW_MAJOR ) @@ -363,7 +424,12 @@ void b_minus_A_halo_x( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors( my_bmAx, is_leader, active_mask ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors( s_mem, my_bmAx, is_leader ); +#endif // Store the results. if ( ROW_MAJOR ) diff --git a/core/src/solvers/kaczmarz_solver.cu b/core/src/solvers/kaczmarz_solver.cu index aad6ab89..f423a384 100644 --- a/core/src/solvers/kaczmarz_solver.cu +++ b/core/src/solvers/kaczmarz_solver.cu @@ -264,6 +264,9 @@ __global__ void kaczmarz_smooth_kernel_warp_atomics(const IndexType num_rows, { const int num_warps = kCtaSize / 32; const int num_rows_per_iter = num_warps * gridDim.x; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif const int warpId = threadIdx.x / 32; const int laneId = threadIdx.x % 32; @@ -279,7 +282,13 @@ __global__ void kaczmarz_smooth_kernel_warp_atomics(const IndexType num_rows, { ValueTypeB aValue = j < row_end ? Ax[j] : ValueTypeB(0); ValueTypeB xValue = j < row_end ? 
xout[Aj[j]] : ValueTypeB(0); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 r = utils::warp_reduce<1, utils::Add>(aValue * xValue); +#endif + //#else +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + r = utils::warp_reduce<1, utils::Add>(smem, aValue * xValue); +#endif Axi += r; } @@ -312,6 +321,9 @@ __global__ void randomized_kaczmarz_smooth_kernel_warp_atomics(const IndexType n { const int num_warps = kCtaSize / 32; const int num_rows_per_iter = num_warps * gridDim.x; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif const int warpId = threadIdx.x / 32; const int laneId = threadIdx.x % 32; @@ -330,8 +342,15 @@ __global__ void randomized_kaczmarz_smooth_kernel_warp_atomics(const IndexType n { ValueTypeB aValue = j < row_end ? Ax[j] : ValueTypeB(0); ValueTypeB xValue = j < row_end ? xout[Aj[j]] : ValueTypeB(0); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 r = utils::warp_reduce<1, utils::Add>(aValue * xValue); aa = utils::warp_reduce<1, utils::Add>(aValue * aValue); +#endif + //#else +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + r = utils::warp_reduce<1, utils::Add>(smem, aValue * xValue); + aa = utils::warp_reduce<1, utils::Add>(smem, aValue * aValue); +#endif Axi += r; AA += aa; } @@ -367,6 +386,9 @@ __global__ void kaczmarz_smooth_kernel(const IndexType num_rows, IndexType i, t; const int num_warps = kCtaSize / 32; const int num_rows_per_iter = num_warps * gridDim.x; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif const int warpId = threadIdx.x / 32; const int laneId = threadIdx.x % 32; @@ -383,7 +405,13 @@ __global__ void kaczmarz_smooth_kernel(const IndexType num_rows, { ValueTypeB aValue = j < row_end ? Ax[j] : ValueTypeB(0); ValueTypeB xValue = j < row_end ? xout[Aj[j]] : ValueTypeB(0); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 r = utils::warp_reduce<1, utils::Add>(aValue * xValue); +#endif + //#else +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + r = utils::warp_reduce<1, utils::Add>(smem, aValue * xValue); +#endif Axi += r; //Axi += utils::Warp_reduce_linear<1,32>::execute(aValue * xValue); //Axi += Ax[j] * xout[Aj[j]]; @@ -456,6 +484,9 @@ __global__ void multicolor_kaczmarz_smooth_kernel(const IndexType num_rows, { const int num_warps = kCtaSize / 32; const int num_rows_per_iter = num_warps * gridDim.x; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif const int warpId = threadIdx.x / 32; const int laneId = threadIdx.x % 32; int i; @@ -473,7 +504,13 @@ __global__ void multicolor_kaczmarz_smooth_kernel(const IndexType num_rows, { ValueTypeB aValue = j < row_end ? Ax[j] : ValueTypeB(0); ValueTypeB xValue = j < row_end ? 
xout[Aj[j]] : ValueTypeB(0); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 r = utils::warp_reduce<1, utils::Add>(aValue * xValue); + //#else +#endif +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + r = utils::warp_reduce<1, utils::Add>(smem, aValue * xValue); +#endif Axi += r; } diff --git a/core/src/solvers/multicolor_dilu_solver.cu b/core/src/solvers/multicolor_dilu_solver.cu index fc741620..be69e2a4 100644 --- a/core/src/solvers/multicolor_dilu_solver.cu +++ b/core/src/solvers/multicolor_dilu_solver.cu @@ -61,10 +61,10 @@ enum { CTA_SIZE = 128, WARP_SIZE = 32 }; template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE, int NUM_WARP_ITERS_PER_BLOCK > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_setup_NxN_kernel_large( const int *__restrict A_rows, const int *__restrict A_cols, @@ -390,10 +390,10 @@ void DILU_setup_NxN_kernel_large( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_setup_NxN_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -671,10 +671,10 @@ void DILU_setup_NxN_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int NUM_THREADS_PER_ROW, int CTA_SIZE, int WARP_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_setup_1x1_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -703,6 +703,10 @@ void DILU_setup_1x1_kernel( const int *__restrict A_rows, __shared__ volatile int s_A_ji[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. volatile int *my_s_A_ji = &s_A_ji[warp_id * WARP_SIZE]; + // Shared memory to compute a reduction (not needed for >= SM30). +#if __CUDA_ARCH__ < 300 + __shared__ volatile Matrix_type s_mem[CTA_SIZE]; +#endif // Determine which NxN block the threads work with. int a_row_it = blockIdx.x * NUM_WARPS_PER_CTA + warp_id; @@ -819,12 +823,26 @@ void DILU_setup_1x1_kernel( const int *__restrict A_rows, } // current_color != 0 // Reduce the e_outs in one value. +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) { e_out += utils::shfl_xor( e_out, mask ); } +#else + s_mem[threadIdx.x] = e_out; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_mem[threadIdx.x] = e_out += s_mem[threadIdx.x + offset]; + } + +#endif + // Store the result. 
if ( lane_id == 0 ) { @@ -842,6 +860,7 @@ void DILU_setup_1x1_kernel( const int *__restrict A_rows, /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 template< int N, bool ROW_MAJOR, int WARP_SIZE, typename Value_type > static __device__ __forceinline__ Value_type reduce_distributed_vectors( Value_type x, int is_leader ) @@ -872,15 +891,54 @@ Value_type reduce_distributed_vectors( Value_type x, int is_leader ) return x; } +//#endif +//#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 +#else +template< int N, bool ROW_MAJOR, int WARP_SIZE, typename Value_type > +static __device__ __forceinline__ +Value_type reduce_distributed_vectors( volatile Value_type *s_mem, Value_type x, int is_leader ) +{ + if ( N & (N - 1) ) + { +#pragma unroll + + for ( int i = 1 ; i < N ; ++i ) + { + const int offset = ROW_MAJOR ? i : N * i; + + if ( is_leader && utils::lane_id() < WARP_SIZE - offset ) + { + s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset]; + } + } + } + else + { +#pragma unroll + + for ( int i = 1 ; i < N ; i <<= 1 ) + { + const int offset = ROW_MAJOR ? i : N * i; + + if ( utils::lane_id() < WARP_SIZE - offset ) + { + s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset]; + } + } + } + + return x; +} +#endif /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_NxN_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -915,8 +973,17 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, // Useful index to compute matrix products. const int lane_id_mod_NxN_div_N = lane_id_mod_NxN / N; const int lane_id_mod_NxN_mod_N = lane_id_mod_NxN % N; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + __shared__ volatile int s_a_col_is_valid[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[threadIdx.x - lane_id_mod_NxN]; + volatile int *my_s_a_col_is_valid = &s_a_col_is_valid[threadIdx.x - lane_id_mod_NxN]; +#else // We to get my data from when I use SHFL. const int shfl_offset = lane_id - lane_id_mod_NxN; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -993,6 +1060,7 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, // Determine if the color is valid. int a_col_is_valid = false; #ifdef AMGX_ILU_COLORING + if ( a_col_id != -1 && current_color != 0 ) { if ( boundary_coloring == FIRST ) @@ -1012,6 +1080,10 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, a_col_is_valid = row_colors[a_col_id] < current_color; } +#endif +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[lane_id_mod_NxN] = a_col_id; + my_s_a_col_is_valid[lane_id_mod_NxN] = a_col_is_valid; #endif // Count the number of active columns. 
// int vote = utils::ballot(aColId != -1); @@ -1023,8 +1095,13 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, { int my_k = k + lane_id_mod_NxN_div_N; // Load N blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; + int uniform_a_col_is_valid = my_s_a_col_is_valid[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); int uniform_a_col_is_valid = utils::shfl( a_col_is_valid, shfl_offset + my_k ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 ) @@ -1084,7 +1161,12 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, is_leader = lane_id_mod_NxN_mod_N == 0; } +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors( my_bmAx, is_leader ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors( s_mem, my_bmAx, is_leader ); +#endif // Update the shared terms. if ( ROW_MAJOR ) @@ -1113,7 +1195,12 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors( my_bmAx, is_leader ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors( s_mem, my_bmAx, is_leader ); +#endif // Store the results. if ( ROW_MAJOR ) @@ -1135,10 +1222,10 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE, bool HAS_EXTERNAL_DIAG, int NUM_WARP_ITERS_PER_BLOCK > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_NxN_kernel_large( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1382,10 +1469,10 @@ void DILU_forward_NxN_kernel_large( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_4x4_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1417,8 +1504,17 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, // Useful index to compute matrix products. const int lane_id_mod_16_div_4 = lane_id_mod_16 / 4; const int lane_id_mod_16_mod_4 = lane_id_mod_16 % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + __shared__ volatile int s_a_col_is_valid[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[threadIdx.x - lane_id_mod_16]; + volatile int *my_s_a_col_is_valid = &s_a_col_is_valid[threadIdx.x - lane_id_mod_16]; +#else // We to get my data from when I use SHFL. const int shfl_offset = lane_id - lane_id_mod_16; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. 
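// The DILU kernels in the surrounding hunks repeatedly broadcast a per-lane
// column ID to every lane of the 16-wide sub-group that works on one block row:
// through a shuffle on SM30+, or through a per-group shared slice otherwise.
// A minimal sketch, assuming the group lies inside one warp; both helper names
// are hypothetical, not from the patch:
__device__ __forceinline__ int broadcast_in_group( int a_col_id, int shfl_offset, int k )
{
    // SM30+ : read lane (shfl_offset + k) of the warp directly.
    return __shfl_sync( 0xffffffffu, a_col_id, shfl_offset + k );
}

__device__ __forceinline__ int broadcast_in_group_smem( volatile int *my_s_a_col_ids, int a_col_id, int lane_in_group, int k )   // hypothetical pre-SM30 variant
{
    // Pre-SM30 : stage the ID in the group's shared slice, then read slot k.
    my_s_a_col_ids[lane_in_group] = a_col_id;
    return my_s_a_col_ids[k];
}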
@@ -1500,6 +1596,10 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, a_col_is_valid = row_colors[a_col_id] < current_color; } +#endif +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[lane_id_mod_16] = a_col_id; + my_s_a_col_is_valid[lane_id_mod_16] = a_col_is_valid; #endif // Count the number of active columns. // int vote = utils::ballot(aColId != -1); @@ -1511,8 +1611,13 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, { int my_k = k + lane_id_mod_16_div_4; // Load N blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; + int uniform_a_col_is_valid = my_s_a_col_is_valid[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); int uniform_a_col_is_valid = utils::shfl( a_col_is_valid, shfl_offset + my_k ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 ) @@ -1572,7 +1677,12 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, is_leader = lane_id_mod_16_mod_4 == 0; } +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors<4, ROW_MAJOR, WARP_SIZE>( my_bmAx, is_leader ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors<4, ROW_MAJOR, WARP_SIZE>( s_mem, my_bmAx, is_leader ); +#endif // Update the shared terms. if ( ROW_MAJOR ) @@ -1601,7 +1711,12 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors<4, ROW_MAJOR, WARP_SIZE>( my_bmAx, is_leader ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors<4, ROW_MAJOR, WARP_SIZE>( s_mem, my_bmAx, is_leader ); +#endif // Store the results. if ( ROW_MAJOR ) @@ -1625,10 +1740,10 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int CTA_SIZE, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1656,7 +1771,16 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Useful constants. const int thread_id_mod_16_div_4 = thread_id_mod_16 / 4; const int thread_id_mod_16_mod_4 = thread_id_mod_16 % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + __shared__ volatile int s_a_col_is_valid[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[16 * thread_id_div_16]; + volatile int *my_s_a_col_is_valid = &s_a_col_is_valid[16 * thread_id_div_16]; +#else const int shfl_offset = 16 * (lane_id / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -1727,6 +1851,10 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, a_col_is_valid = row_colors[a_col_id] < current_color; } +#endif +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[thread_id_mod_16] = a_col_id; + my_s_a_col_is_valid[thread_id_mod_16] = a_col_is_valid; #endif // Loop over columns. We compute 8 columns per iteration. 
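// The kernels in this solver all receive the same occupancy-hint change:
// a higher minimum resident-CTA count (typically twelve) is requested on
// sm_35 and newer, and a lower one (eight) on older parts, so the compiler
// limits register use accordingly.  A stand-alone sketch of that pattern;
// the kernel name and body are placeholders, not from the patch:
template< int CTA_SIZE >
__global__
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
__launch_bounds__( CTA_SIZE, 12 )
#elif defined(__CUDA_ARCH__)
__launch_bounds__( CTA_SIZE, 8 )
#endif
void occupancy_tuned_kernel( int *out )
{
    out[blockIdx.x * CTA_SIZE + threadIdx.x] = threadIdx.x;
}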
@@ -1734,8 +1862,13 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, { int my_k = k + thread_id_mod_16_div_4; // Load 8 blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; + int uniform_a_col_is_valid = my_s_a_col_is_valid[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); int uniform_a_col_is_valid = utils::shfl( a_col_is_valid, shfl_offset + my_k ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 ) @@ -1778,8 +1911,17 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Load Einvs. Matrix_type my_Einv = Einv[16 * a_row_id + thread_id_mod_16]; // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx += utils::shfl_xor( my_bmAx, 4 ); my_bmAx += utils::shfl_xor( my_bmAx, 8 ); +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( lane_id < 28 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 4]; } + + if ( lane_id < 24 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 8]; } + +#endif // Update the shared terms. if ( thread_id_mod_16_div_4 == 0 ) @@ -1790,8 +1932,17 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Update the diagonal term. my_bmAx = my_Einv * my_s_mem[thread_id_mod_16_mod_4]; // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx += utils::shfl_xor( my_bmAx, 1 ); my_bmAx += utils::shfl_xor( my_bmAx, 2 ); +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( lane_id < 31 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 1]; } + + if ( lane_id < 30 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 2]; } + +#endif // Store the results. if ( thread_id_mod_16_mod_4 == 0 ) @@ -1805,10 +1956,10 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int NUM_THREADS_PER_ROW, int CTA_SIZE, int WARP_SIZE, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_1x1_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1834,6 +1985,10 @@ void DILU_forward_1x1_kernel( const int *__restrict A_rows, const int lane_id = utils::lane_id(); // Constants. const int lane_id_mod_NTPR = lane_id % NUM_THREADS_PER_ROW; +#if __CUDA_ARCH__ < 300 + // Shared memory needed to exchange X and delta. + __shared__ volatile Vector_type s_mem[CTA_SIZE]; +#endif // Determine which NxN block the threads work with. int a_row_it = blockIdx.x * NUM_ROWS_PER_CTA + (threadIdx.x / NUM_THREADS_PER_ROW); @@ -1932,6 +2087,7 @@ void DILU_forward_1x1_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 #pragma unroll for ( int mask = NUM_THREADS_PER_ROW / 2 ; mask > 0 ; mask >>= 1 ) @@ -1939,6 +2095,18 @@ void DILU_forward_1x1_kernel( const int *__restrict A_rows, my_bmAx += utils::shfl_xor( my_bmAx, mask ); } +#else + s_mem[threadIdx.x] = my_bmAx; +#pragma unroll + + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id_mod_NTPR < offset ) + { + s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + offset]; + } + +#endif + // Store the results. 
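// The 1x1 kernel above reduces one partial sum per matrix row across
// NUM_THREADS_PER_ROW lanes: a log2(N) xor-shuffle loop on sm_30+, or the
// strided shared-memory loop restored just above for Fermi. A standalone
// sketch of the shuffle variant follows; the name sketch_row_sum(), the float
// type and the full-warp mask are assumptions, and NTPR must be a power of two
// no larger than the warp size.
template< int NTPR >
__device__ __forceinline__ float sketch_row_sum( float v )
{
#pragma unroll
    for ( int mask = NTPR / 2 ; mask > 0 ; mask >>= 1 )
    {
        // An xor stride smaller than NTPR keeps the exchange inside each
        // aligned group of NTPR lanes, so several rows can share one warp.
        v += __shfl_xor_sync( 0xffffffffu, v, mask );
    }

    return v; // every lane of the group ends up holding the group's sum
}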
if ( lane_id_mod_NTPR == 0 ) { @@ -1951,10 +2119,10 @@ void DILU_forward_1x1_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, typename WeightType, int N, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_NxN_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1989,8 +2157,15 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, // Useful index to compute matrix products. const int lane_id_mod_NxN_div_N = lane_id_mod_NxN / N; const int lane_id_mod_NxN_mod_N = lane_id_mod_NxN % N; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[threadIdx.x - lane_id_mod_NxN]; +#else // We to get my data from when I use SHFL. const int shfl_offset = lane_id - lane_id_mod_NxN; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -2068,6 +2243,10 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, a_col_id = a_col_tmp; } +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[lane_id_mod_NxN] = a_col_id; +#endif + // Count the number of active columns. // int vote = utils::ballot(aColId != -1); // The number of iterations. @@ -2078,7 +2257,11 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, { int my_k = k + lane_id_mod_NxN_div_N; // Load N blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 ) @@ -2130,7 +2313,12 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, is_leader = lane_id_mod_NxN_mod_N == 0; } +#if __CUDA_ARCH__ >= 300 my_delta = reduce_distributed_vectors( my_delta, is_leader ); +#else + s_mem[threadIdx.x] = my_delta; + my_delta = reduce_distributed_vectors( s_mem, my_delta, is_leader ); +#endif // Update the shared terms. if ( ROW_MAJOR ) @@ -2159,7 +2347,12 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_delta = reduce_distributed_vectors( my_delta, is_leader ); +#else + s_mem[threadIdx.x] = my_delta; + my_delta = reduce_distributed_vectors( s_mem, my_delta, is_leader ); +#endif // Store the results. 
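// At the call sites above, the restored Fermi branch first spills the per-lane
// value into the volatile s_mem buffer and then calls an overload of
// reduce_distributed_vectors that takes that buffer, while the sm_30+ branch
// calls a register-only overload. The sketch below only shows the shape of
// that dual code path with a hypothetical pair-wise helper, sketch_pair_sum();
// it is not the real reduce_distributed_vectors, which is templated on the
// block size and layout and reduces N-wide groups.
// Register/shuffle overload (sm_30+): no scratch memory needed.
__device__ __forceinline__ double sketch_pair_sum( double v )
{
    return v + __shfl_xor_sync( 0xffffffffu, v, 1 );
}

// Shared-memory overload (Fermi): the caller passes the CTA-wide scratch
// buffer, already loaded with this thread's value; only the even lane of each
// pair ends up with the full sum, which is all the leader-lane logic needs.
__device__ __forceinline__ double sketch_pair_sum( volatile double *s_scratch, double v )
{
    const int lane_id = threadIdx.x % 32;

    if ( ( lane_id & 1 ) == 0 ) { s_scratch[threadIdx.x] = v += s_scratch[threadIdx.x + 1]; }

    return v;
}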
if ( ROW_MAJOR ) @@ -2209,10 +2402,10 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, typename WeightType, int N, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR, int NUM_WARP_ITERS_PER_BLOCK > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_NxN_kernel_large( const int *__restrict A_rows, const int *__restrict A_cols, @@ -2451,10 +2644,10 @@ void DILU_backward_NxN_kernel_large( const int *__restrict A_rows, template< typename IndexType, typename ValueTypeA, typename ValueTypeB, typename WeightType, int CTA_SIZE, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_4x4_kernel( const IndexType *row_offsets, const IndexType *column_indices, @@ -2477,7 +2670,14 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, const int halfLaneId = threadIdx.x % 16; const int halfLaneId_div_4 = halfLaneId / 4; const int halfLaneId_mod_4 = halfLaneId % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_aColIds[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_aColIds = &s_aColIds[16 * halfWarpId]; +#else const int upperHalf = 16 * (laneId / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile ValueTypeB s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -2520,12 +2720,19 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, aColId = aColTmp; } +#endif +#if __CUDA_ARCH__ < 300 + my_s_aColIds[halfLaneId] = aColId; #endif for ( int k = 0 ; k < 16 ; k += 4 ) { int my_k = k + halfLaneId_div_4; // Exchange column indices. +#if __CUDA_ARCH__ < 300 + int waColId = my_s_aColIds[my_k]; +#else int waColId = utils::shfl( aColId, upperHalf + my_k ); +#endif // Load 8 blocks of X if needed. ValueTypeB my_x(0); @@ -2542,7 +2749,12 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, { const int k_i = k + i; int w_aColTmp = aColBegin + k_i, w_aColIt = -1; +#if __CUDA_ARCH__ < 300 + + if ( my_s_aColIds[k_i] != -1 && w_aColTmp < aColEnd ) +#else if ( utils::shfl( aColId, upperHalf + k_i ) != -1 && w_aColTmp < aColEnd ) +#endif w_aColIt = w_aColTmp; ValueTypeA my_val(0); @@ -2567,6 +2779,8 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, // Load EINV values. ValueTypeA my_Einv = Einv[16 * aRowId + halfLaneId]; // Reduce delta terms. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_delta += utils::shfl_xor( my_delta, 1 ); @@ -2578,6 +2792,24 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, my_delta += utils::shfl_xor( my_delta, 8 ); } +#else + s_mem[threadIdx.x] = my_delta; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 8]; } + } + +#endif + // Update the shared terms. 
if ( ROW_MAJOR ) { @@ -2605,6 +2837,8 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, } // Regroup results. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_delta += utils::shfl_xor( my_delta, 1 ); @@ -2616,6 +2850,24 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, my_delta += utils::shfl_xor( my_delta, 8 ); } +#else + s_mem[threadIdx.x] = my_delta; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 8]; } + } + +#endif + // Store the results. if ( ROW_MAJOR ) { @@ -2662,10 +2914,10 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, template< typename Matrix_type, typename Vector_type, typename WeightType, int CTA_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, const int *__restrict A_cols, @@ -2693,7 +2945,14 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Useful constants. const int thread_id_mod_16_div_4 = thread_id_mod_16 / 4; const int thread_id_mod_16_mod_4 = thread_id_mod_16 % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[16 * thread_id_div_16]; +#else const int shfl_offset = 16 * (lane_id / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -2754,6 +3013,9 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, a_col_id = -1; } +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[thread_id_mod_16] = a_col_id; +#endif // Loop over columns. We compute 8 columns per iteration. #pragma unroll 2 @@ -2761,7 +3023,11 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, { int my_k = k + thread_id_mod_16_div_4; // Load 8 blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); +#endif Vector_type my_Delta(0); if ( uniform_a_col_id != -1 ) @@ -2794,8 +3060,17 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Load EINV values. Matrix_type my_Einv = Einv[16 * a_row_id + thread_id_mod_16]; // Reduce delta terms. +#if __CUDA_ARCH__ >= 300 my_delta += utils::shfl_xor( my_delta, 4 ); my_delta += utils::shfl_xor( my_delta, 8 ); +#else + s_mem[threadIdx.x] = my_delta; + + if ( lane_id < 28 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 4]; } + + if ( lane_id < 24 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 8]; } + +#endif // Update the shared terms. if ( thread_id_mod_16_div_4 == 0 ) @@ -2806,8 +3081,17 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Update the diagonal term. my_delta = my_Einv * my_s_mem[thread_id_mod_16_mod_4]; // Regroup results. 
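// The "reduce"/"regroup" steps in these 4x4 kernels sum four partial products
// spread over a 16-lane half-warp, and the xor strides encode the block
// layout: row-major keeps the four partners in consecutive lanes (strides 1
// and 2), column-major keeps them four lanes apart (strides 4 and 8). The
// restored Fermi branches do the same strided adds through volatile shared
// memory behind "laneId < 31/30" or "laneId < 28/24" guards. A standalone
// sketch of the shuffle variant follows; the name sketch_regroup4() and the
// use of double for the templated value types are assumptions.
template< bool ROW_MAJOR >
__device__ __forceinline__ double sketch_regroup4( double v )
{
    // Two xor-butterfly steps; afterwards every lane of the four-lane partner
    // set holds the complete sum. The strides fold at compile time.
    const int stride_a = ROW_MAJOR ? 1 : 4;
    const int stride_b = ROW_MAJOR ? 2 : 8;
    v += __shfl_xor_sync( 0xffffffffu, v, stride_a );
    v += __shfl_xor_sync( 0xffffffffu, v, stride_b );
    return v;
}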
+#if __CUDA_ARCH__ >= 300 my_delta += utils::shfl_xor( my_delta, 1 ); my_delta += utils::shfl_xor( my_delta, 2 ); +#else + s_mem[threadIdx.x] = my_delta; + + if ( lane_id < 31 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 1]; } + + if ( lane_id < 30 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 2]; } + +#endif // Store the results. int offset = 4 * a_row_id + thread_id_mod_16_div_4; Vector_type my_b(0), my_x(0); @@ -2832,10 +3116,10 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, typename WeightType, int NUM_THREADS_PER_ROW, int CTA_SIZE, int WARP_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_1x1_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -2861,6 +3145,10 @@ void DILU_backward_1x1_kernel( const int *__restrict A_rows, const int lane_id = utils::lane_id(); // Constants. const int lane_id_mod_NTPR = lane_id % NUM_THREADS_PER_ROW; +#if __CUDA_ARCH__ < 300 + // Shared memory needed to exchange X and delta. + __shared__ volatile Vector_type s_mem[CTA_SIZE]; +#endif // Determine which NxN block the threads work with. int a_row_it = blockIdx.x * NUM_ROWS_PER_CTA + (threadIdx.x / NUM_THREADS_PER_ROW); @@ -2928,6 +3216,7 @@ void DILU_backward_1x1_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 #pragma unroll for ( int mask = NUM_THREADS_PER_ROW / 2 ; mask > 0 ; mask >>= 1 ) @@ -2935,6 +3224,18 @@ void DILU_backward_1x1_kernel( const int *__restrict A_rows, my_delta += utils::shfl_xor( my_delta, mask ); } +#else + s_mem[threadIdx.x] = my_delta; +#pragma unroll + + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id_mod_NTPR < offset ) + { + s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + offset]; + } + +#endif + // Store the results. 
if ( lane_id_mod_NTPR == 0 ) { @@ -2949,10 +3250,10 @@ void DILU_backward_1x1_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, typename WeightType, int N, int CTA_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_NxN_kernel_skip( Vector_type *__restrict x, const WeightType weight, diff --git a/core/src/solvers/multicolor_ilu_solver.cu b/core/src/solvers/multicolor_ilu_solver.cu index 35f00a68..a13e6f62 100644 --- a/core/src/solvers/multicolor_ilu_solver.cu +++ b/core/src/solvers/multicolor_ilu_solver.cu @@ -69,10 +69,10 @@ namespace multicolor_ilu_solver template __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CtaSize, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CtaSize, 16 ) +__launch_bounds__( CtaSize, 8 ) #endif void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, const IndexType *LU_smaller_color_offsets, @@ -97,7 +97,14 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, const int halfLaneId = threadIdx.x % 16; const int halfLaneId_div_4 = halfLaneId / 4; const int halfLaneId_mod_4 = halfLaneId % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_aColIds[CtaSize]; + // Each thread keeps its own pointer. + volatile int *my_s_aColIds = &s_aColIds[16 * halfWarpId]; +#else const int upperHalf = 16 * (laneId / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile ValueTypeB s_mem[CtaSize]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -156,6 +163,9 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, aColId = aRowId; } +#if __CUDA_ARCH__ < 300 + my_s_aColIds[halfLaneId] = aColId; +#endif // Count the number of active columns. int vote = utils::ballot(aColId != -1, active_mask); // The number of iterations. @@ -166,7 +176,11 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, { int my_k = k + halfLaneId_div_4; // Load 8 blocks of X. +#if __CUDA_ARCH__ < 300 + int waColId = my_s_aColIds[my_k]; +#else int waColId = utils::shfl( aColId, upperHalf + my_k, warpSize, active_mask ); +#endif ValueTypeB my_x(0); if ( waColId != -1 ) @@ -229,6 +243,10 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, aColId = LU_column_indices[aColIt]; } +#if __CUDA_ARCH__ < 300 + my_s_aColIds[halfLaneId] = aColId; +#endif + // Count the number of active columns. int vote = utils::ballot(aColId != -1, active_mask); // The number of iterations. @@ -238,7 +256,11 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, { int my_k = k + halfLaneId_div_4; // Load 8 blocks of X. +#if __CUDA_ARCH__ < 300 + int waColId = my_s_aColIds[my_k]; +#else int waColId = utils::shfl( aColId, upperHalf + my_k, warpSize, active_mask ); +#endif ValueTypeB my_delta(0); if ( waColId != -1 ) @@ -280,6 +302,8 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, } // If current_color != 0 // Reduce bmAx terms. 
+#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_bmAx += utils::shfl_xor( my_bmAx, 1, warpSize, active_mask ); @@ -291,6 +315,24 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, my_bmAx += utils::shfl_xor( my_bmAx, 8, warpSize, active_mask ); } +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 8]; } + } + +#endif + // Store the results. if ( ROW_MAJOR ) { @@ -427,10 +469,10 @@ void LU_forward_4x4_kernel(const IndexType *LU_row_offsets, const IndexType *LU_ template< typename IndexType, typename ValueTypeA, typename ValueTypeB, int CtaSize, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CtaSize, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CtaSize, 16 ) +__launch_bounds__( CtaSize, 8 ) #endif void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, const IndexType *larger_color_offsets, @@ -454,7 +496,14 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, const int halfLaneId = threadIdx.x % 16; const int halfLaneId_div_4 = halfLaneId / 4; const int halfLaneId_mod_4 = halfLaneId % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_aColIds[CtaSize]; + // Each thread keeps its own pointer. + volatile int *my_s_aColIds = &s_aColIds[16 * halfWarpId]; +#else const int upperHalf = 16 * (laneId / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile ValueTypeB s_mem[CtaSize]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -501,12 +550,20 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, aColId = column_indices[aColIt]; } +#if __CUDA_ARCH__ < 300 + my_s_aColIds[halfLaneId] = aColId; +#endif + // Loop over columns. We compute 8 columns per iteration. for ( int k = 0 ; k < 16 ; k += 4 ) { int my_k = k + halfLaneId_div_4; // Exchange column indices. +#if __CUDA_ARCH__ < 300 + int waColId = my_s_aColIds[my_k]; +#else int waColId = utils::shfl( aColId, upperHalf + my_k, warpSize, active_mask ); +#endif // Load 8 blocks of X if needed. ValueTypeB *my_ptr = Delta; @@ -555,6 +612,8 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, } // Loop over aColIt // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_bmAx += utils::shfl_xor( my_bmAx, 1, warpSize, active_mask ); @@ -565,6 +624,24 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, my_bmAx += utils::shfl_xor( my_bmAx, 4, warpSize, active_mask ); my_bmAx += utils::shfl_xor( my_bmAx, 8, warpSize, active_mask ); } + +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 8]; } + } + +#endif } // if current_color != num_colors-1 // Update the shared terms. 
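// The warp-per-row ILU kernels above pass an explicit active_mask and the warp
// size to utils::ballot and utils::shfl_xor, i.e. they use the CUDA 9 style
// *_sync primitives so partially active warps stay well defined. A standalone
// sketch of the "ballot the valid columns, derive the loop bound from the
// vote" idiom follows; the function and variable names are illustrative only.
__device__ __forceinline__ int sketch_count_valid_columns( int a_col_id )
{
    // Capture which lanes are still executing this path, then vote on which of
    // them hold a valid (non -1) column index.
    const unsigned int active_mask = __activemask();
    const unsigned int vote        = __ballot_sync( active_mask, a_col_id != -1 );
    // The popcount bounds the columns-per-iteration loop in the kernels above;
    // lanes whose bit is unset simply contribute nothing.
    return __popc( vote );
}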
@@ -603,6 +680,8 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, } // Regroup results. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_bmAx += utils::shfl_xor( my_bmAx, 1 ); @@ -614,6 +693,24 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, my_bmAx += utils::shfl_xor( my_bmAx, 8 ); } +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 8]; } + } + +#endif + // Store the results. if ( ROW_MAJOR ) { @@ -989,10 +1086,10 @@ computeAtoLUmappingExtDiag_kernel( int A_nRows, template< typename ValueTypeA, int CtaSize, int SMemSize, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CtaSize, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CtaSize, 12 ) +__launch_bounds__( CtaSize, 8 ) #endif void compute_LU_factors_4x4_kernel_warp( int A_nRows, @@ -1021,7 +1118,12 @@ compute_LU_factors_4x4_kernel_warp( int A_nRows, __shared__ volatile ValueTypeA s_C_mtx[nWarps][32]; __shared__ volatile ValueTypeA s_F_mtx[nWarps][16]; // Shared memory to store the proposed column to load +#if __CUDA_ARCH__ < 300 + __shared__ volatile int s_aColItToLoad [nWarps][32]; + __shared__ volatile int s_waColItToLoad[nWarps][32]; +#else __shared__ volatile int s_aColSrc[nWarps][32]; +#endif // Shared memory to store the column indices of the current row __shared__ volatile int s_keys[nWarps][SMemSize]; @@ -1143,7 +1245,12 @@ compute_LU_factors_4x4_kernel_warp( int A_nRows, if (pred) { +#if __CUDA_ARCH__ < 300 + s_aColItToLoad [warpId][idst] = found_aColIt; + s_waColItToLoad[warpId][idst] = waColIt; +#else s_aColSrc[warpId][idst] = laneId; +#endif } utils::syncwarp(active_mask); @@ -1156,6 +1263,13 @@ compute_LU_factors_4x4_kernel_warp( int A_nRows, // Where to get columns from. int a_col_it = -1, w_col_it = -1; // Load column to load +#if __CUDA_ARCH__ < 300 + if ( my_k < n_cols ) + { + a_col_it = s_aColItToLoad [warpId][my_k]; + w_col_it = s_waColItToLoad[warpId][my_k]; + } +#else a_col_it = utils::shfl(found_aColIt, s_aColSrc[warpId][my_k], warpSize, active_mask); w_col_it = utils::shfl(waColIt, s_aColSrc[warpId][my_k], warpSize, active_mask); @@ -1165,6 +1279,7 @@ compute_LU_factors_4x4_kernel_warp( int A_nRows, w_col_it = -1; } +#endif ValueTypeA my_C(0); if ( w_col_it != -1 ) diff --git a/examples/amgx_spmv_example/Makefile b/examples/amgx_spmv_example/Makefile index db08516e..cb0ae6da 100644 --- a/examples/amgx_spmv_example/Makefile +++ b/examples/amgx_spmv_example/Makefile @@ -16,7 +16,7 @@ EXTRA_LIBS = -lcublas -lcusparse -lcusolver -Xlinker=-rpath=$(CUDA_PATH)/lib64 AMGX_ROOT = ../.. 
AMGX_INCLUDE = -I$(AMGX_ROOT)/../../thrust -I$(AMGX_ROOT)/base/include -I$(CUDA_PATH)/include -I$(AMGX_ROOT)/external/rapidjson/include -I$(MPI_PATH)/include -BASE_CU_FILES := ../../base/src/misc.cu ../../base/src/device_properties.cu ../../base/src/logger.cu ../../base/src/auxdata.cu ../../base/src/amgx_cusparse.cu ../../base/src/amgx_cublas.cu ../../base/src/amg_config.cu ../../base/src/global_thread_handle.cu ../../base/src/error.cu ../../base/src/thread_manager.cu ../../base/src/resources.cu ../../base/src/matrix.cu ../../base/src/multiply.cu ../../base/src/hash_workspace.cu ../../base/src/csr_multiply.cu ../../base/src/csr_multiply_sm70.cu ../../base/src/csr_multiply_sm35.cu ../../base/src/matrix_coloring/matrix_coloring.cu ../../base/src/distributed/distributed_manager.cu ../../base/src/distributed/distributed_arranger.cu ../../base/src/distributed/comms_visitors1.cu ../../base/src/distributed/comms_visitors2.cu ../../base/src/distributed/comms_visitors3.cu ../../base/src/distributed/comms_visitors4.cu ../../base/src/distributed/comms_mpi_hostbuffer_stream.cu ../../base/src/distributed/comms_mpi_gpudirect.cu ../../base/src/distributed/distributed_comms.cu +BASE_CU_FILES := ../../base/src/misc.cu ../../base/src/device_properties.cu ../../base/src/logger.cu ../../base/src/auxdata.cu ../../base/src/amgx_cusparse.cu ../../base/src/amgx_cublas.cu ../../base/src/amg_config.cu ../../base/src/global_thread_handle.cu ../../base/src/error.cu ../../base/src/thread_manager.cu ../../base/src/resources.cu ../../base/src/matrix.cu ../../base/src/multiply.cu ../../base/src/hash_workspace.cu ../../base/src/csr_multiply.cu ../../base/src/csr_multiply_sm20.cu ../../base/src/csr_multiply_sm35.cu ../../base/src/matrix_coloring/matrix_coloring.cu ../../base/src/distributed/distributed_manager.cu ../../base/src/distributed/distributed_arranger.cu ../../base/src/distributed/comms_visitors1.cu ../../base/src/distributed/comms_visitors2.cu ../../base/src/distributed/comms_visitors3.cu ../../base/src/distributed/comms_visitors4.cu ../../base/src/distributed/comms_mpi_hostbuffer_stream.cu ../../base/src/distributed/comms_mpi_gpudirect.cu ../../base/src/distributed/distributed_comms.cu OBJDIR_S_GPU := obj_s OBJDIR_M_GPU := obj_m
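// In the compute_LU_factors_4x4_kernel_warp hunk above, the restored Fermi
// branch stages the compacted column iterators themselves in shared memory
// (s_aColItToLoad / s_waColItToLoad), whereas the sm_30+ branch stores only the
// source lane (s_aColSrc) and later fetches the values with a shuffle whose
// source lane is data dependent. A standalone sketch of that shuffle-as-gather
// idiom follows; the name sketch_warp_gather() and the full-warp mask are
// assumptions, not AMGX code.
__device__ __forceinline__ int sketch_warp_gather( int my_value, int src_lane, bool valid )
{
    // Every lane reads my_value from the lane selected by src_lane; slots
    // flagged invalid are mapped to -1, matching the "-1 means no column"
    // convention used throughout these kernels.
    const int fetched = __shfl_sync( 0xffffffffu, my_value, src_lane );

    return valid ? fetched : -1;
}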