From d0019e5d32e99e7d679b3b773cf16b6f8e7da6f9 Mon Sep 17 00:00:00 2001
From: marsaev <32493426+marsaev@users.noreply.github.com>
Date: Wed, 8 Apr 2020 00:11:09 +0300
Subject: [PATCH] Revert "Removing references to Fermi, code specific to Fermi, and adding duplicates for Volta ready for tuning."

This reverts commit cb986b36dfdd1f23f01ddb7ee793ce2dc205c52d.
---
 base/include/csr_multiply_sm35.h              |    2 +-
 base/include/csr_multiply_sm70.h              |   82 -
 base/include/hash_containers_sm70.inl         | 1430 -----------
 base/include/ld_functions.h                   |    8 +
 base/include/memory_intrinsics.h              |   32 +
 base/include/sm_utils.inl                     |  168 ++
 base/include/strided_reduction.h              |   37 +-
 base/include/texture.h                        |    4 +
 base/src/classical/selectors/selector.cu      |   88 +-
 base/src/csr_multiply.cu                      |   24 +-
 base/src/csr_multiply_sm35.cu                 |   29 +-
 base/src/csr_multiply_sm70.cu                 | 2124 -----------------
 base/src/multiply.cu                          |   52 +-
 core/include/matrix_coloring/bfs.h            |   29 +
 core/include/matrix_coloring/coloring_utils.h |   33 +
 .../low_deg_coarse_A_generator.cu             |  267 ++-
 .../selectors/parallel_greedy_selector.cu     |    6 +
 core/src/classical/interpolators/distance2.cu |  281 ++-
 core/src/classical/interpolators/multipass.cu |  224 +-
 core/src/classical/strength/affinity.cu       |   36 +
 core/src/matrix_coloring/coloring_utils.cu    |    4 +
 .../matrix_coloring/greedy_min_max_2ring.cu   |   59 +
 core/src/matrix_coloring/min_max.cu           |   20 +
 core/src/matrix_coloring/min_max_2ring.cu     |    3 +
 core/src/scalers/nbinormalization.cu          |   57 +
 core/src/solvers/dense_lu_solver.cu           |   70 +-
 core/src/solvers/kaczmarz_solver.cu           |   37 +
 core/src/solvers/multicolor_dilu_solver.cu    |  357 ++-
 core/src/solvers/multicolor_ilu_solver.cu     |  127 +-
 examples/amgx_spmv_example/Makefile           |    2 +-
 30 files changed, 1885 insertions(+), 3807 deletions(-)
 delete mode 100644 base/include/csr_multiply_sm70.h
 delete mode 100644 base/include/hash_containers_sm70.inl
 delete mode 100644 base/src/csr_multiply_sm70.cu

diff --git a/base/include/csr_multiply_sm35.h b/base/include/csr_multiply_sm35.h
index 985b3ee0..40d85c8c 100644
--- a/base/include/csr_multiply_sm35.h
+++ b/base/include/csr_multiply_sm35.h
@@ -31,7 +31,7 @@ namespace amgx
 {
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// SM35 implementation of the CSR_Multiply routines.
+// SM20 implementation of the CSR_Multiply routines.
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 template< typename T_Config >
diff --git a/base/include/csr_multiply_sm70.h b/base/include/csr_multiply_sm70.h
deleted file mode 100644
index bb447ee2..00000000
--- a/base/include/csr_multiply_sm70.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2013-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- *   contributors may be used to endorse or promote products derived
- *   from this software without specific prior written permission.
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -//#pragma once - -namespace amgx -{ - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SM70 implementation of the CSR_Multiply routines. -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< typename T_Config > -class CSR_Multiply_Sm70 -{}; - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -class CSR_Multiply_Sm70 > : public CSR_Multiply_Impl > -{ - typedef CSR_Multiply_Impl > Base; - public: - typedef typename Base::TConfig_d TConfig_d; - typedef typename Base::Matrix_d Matrix_d; - typedef typename Matrix_d::IVector IVector_d; - typedef typename Base::Value_type Value_type; - typedef typename Matrix_d::IVector IVector; - typedef typename Matrix_d::MVector MVector; - - public: - // Create a workspace to run the product. - CSR_Multiply_Sm70( bool allocate_values = true, int grid_size = 128, int max_warp_count = 8, int gmem_size = 2048 ); - - protected: - // Count the number of non-zero elements. The callee is responsible for setting the work queue value. - void count_non_zeroes( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ); - // Compute the sparsity of RAP_int + RAP_ext - void count_non_zeroes_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids); - - // Compute the ILU1 sparsity of A. - void count_non_zeroes_ilu1( const Matrix_d &A, Matrix_d &B ); - // Compute offsets. - void compute_offsets( Matrix_d &C ); - // Compute the sparsity of the product AxB. - void compute_sparsity( const Matrix_d &A, const Matrix_d &B, Matrix_d &C ); - // Compute the ILU1 sparsity of A. - void compute_sparsity_ilu1( const Matrix_d &A, Matrix_d &B ); - // Compute values. - void compute_values( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, int num_threads, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ); - - void compute_values_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids, int num_threads); - - -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace amgx - diff --git a/base/include/hash_containers_sm70.inl b/base/include/hash_containers_sm70.inl deleted file mode 100644 index 96333ab2..00000000 --- a/base/include/hash_containers_sm70.inl +++ /dev/null @@ -1,1430 +0,0 @@ -/* Copyright (c) 2011-2020, NVIDIA CORPORATION. 
All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -static __constant__ unsigned c_hash_keys[] = -{ - 3499211612, 581869302, 3890346734, 3586334585, - 545404204, 4161255391, 3922919429, 949333985, - 2715962298, 1323567403, 418932835, 2350294565, - 1196140740, 809094426, 2348838239, 4264392720 -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< typename Key_type, int SMEM_SIZE = 128, int NUM_HASH_FCTS = 4, int WARP_SIZE = 32 > -class Hash_set -{ - // Associated index. - typedef Hash_index Index; - - protected: - // The size of the table (occupancy). - int m_smem_count, m_gmem_count; - // The keys stored in the hash table. - volatile Key_type *m_smem_keys, *m_gmem_keys; - // The size of the global memory buffer. - const int m_gmem_size; - // Is it ok? - bool m_fail; - - public: - // Constructor. - __device__ __forceinline__ Hash_set( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, int gmem_size ) : - m_smem_count(0), - m_gmem_count(1), - m_smem_keys (smem_keys), - m_gmem_keys (gmem_keys), - m_gmem_size (gmem_size), - m_fail (false) - - {} - - // Clear the table. - __device__ __forceinline__ void clear( bool skip_gmem = false ); - // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). - __device__ __forceinline__ int compute_size(); - // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). - __device__ __forceinline__ int compute_size_with_duplicates(); - // Does the set contain those values? - __device__ __forceinline__ bool contains( Key_type key ) const; - // Find an index. - __device__ __forceinline__ int find_index( Key_type key, const Index &index, bool print_debug ) const; - // Has the process failed. 
- __device__ __forceinline__ bool has_failed() const { return m_fail; } - // Insert a key inside the set. If status is NULL, ignore failure. - __device__ __forceinline__ void insert( Key_type key, int *status ); - // Load a set. - __device__ __forceinline__ void load( int count, const Key_type *keys, const int *pos ); - // Load a set and use it as an index. - __device__ __forceinline__ void load_index( int count, const Key_type *keys, const int *pos, Index &index, bool print_debug ); - // Store a set. - __device__ __forceinline__ void store( int count, Key_type *keys ); - // Store a set. - __device__ __forceinline__ int store_with_positions( Key_type *keys, int *pos ); - // Store a set. - __device__ __forceinline__ int store( Key_type *keys ); -}; - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -void Hash_set::clear( bool skip_gmem ) -{ - int lane_id = utils::lane_id(); - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - - m_smem_count = 0; - - if ( skip_gmem || m_gmem_count == 0 ) - { - m_gmem_count = 0; - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - m_gmem_keys[offset] = -1; - } - - m_gmem_count = 0; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -int Hash_set::compute_size() -{ - m_smem_count += m_gmem_count; -#pragma unroll - - for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) - { - m_smem_count += utils::shfl_xor( m_smem_count, offset ); - } - - m_gmem_count = utils::any( m_gmem_count > 0 ); - return m_smem_count; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -int Hash_set::compute_size_with_duplicates() -{ - int lane_id = utils::lane_id(); - // Count the number of keys in SMEM. - int sum = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - sum += __popc( utils::ballot( key != -1 ) ); - } - - // Is there any key in GMEM. If not, just quit. - m_gmem_count = utils::any(m_gmem_count > 0); - - if ( !m_gmem_count ) - { - return sum; - } - - // Count the number of keys in GMEM. 
-#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - sum += __popc( utils::ballot( key != -1, utils::activemask() ) ); - } - - return sum; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -bool Hash_set::contains( Key_type key ) const -{ - bool done = key == -1, found = false; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return found; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - found = true; - } - - if ( found || stored_key == -1 ) - { - done = true; - } - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return found; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - found = true; - } - - if ( found || stored_key == -1 ) - { - done = true; - } - } - } - - return found; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -int Hash_set::find_index( Key_type key, const Index &index, bool print_debug ) const -{ - int idx = -1; - bool done = key == -1; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return idx; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - int result = index.find_smem(hash); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - idx = result; - done = true; - } - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return idx; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - idx = index.find_gmem(hash); - done = true; - } - } - } - - return idx; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_set::insert( Key_type key, int *status ) -{ - bool done = key == -1; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return; - } - - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = 
m_smem_keys[hash]; - - if ( stored_key == key ) - { - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) // More than one candidate may have written to that slot. - { - m_smem_count++; - done = true; - } - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return; - } - - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_gmem_keys[hash] = key; - } - - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. - { - m_gmem_count++; - done = true; - } - } - } - - if ( utils::all(done) ) - { - return; - } - - assert( status != NULL ); - - if ( utils::lane_id() == 0 ) - { - *status = 1; - } - - m_fail = true; -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_set::load( int count, const Key_type *keys, const int *pos ) -{ - int lane_id = utils::lane_id(); -#pragma unroll 4 - - for ( int offset = lane_id ; offset < count ; offset += WARP_SIZE ) - { - Key_type key = keys[offset]; - int idx = pos [offset]; - // Where to store the item. - volatile Key_type *ptr = m_smem_keys; - - if ( idx >= SMEM_SIZE ) - { - ptr = m_gmem_keys; - m_gmem_count = 1; - idx -= SMEM_SIZE; - } - - // Store the item. - ptr[idx] = key; - } - - m_gmem_count = utils::any( m_gmem_count ); -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_set::load_index( int count, const Key_type *keys, const int *pos, Index &index, bool print_debug ) -{ -#pragma unroll 4 - - for ( int offset = utils::lane_id() ; offset < count ; offset += WARP_SIZE ) - { - Key_type key = keys[offset]; - int idx = pos [offset]; - // Store the item. - volatile Key_type *ptr = m_smem_keys; - - if ( idx >= SMEM_SIZE ) - { - ptr = m_gmem_keys; - m_gmem_count = 1; - idx -= SMEM_SIZE; - index.set_gmem_index( idx, offset ); - } - - // Store the item. - ptr[idx] = key; - } - - // Build the local index. 
- index.build_smem_index( m_smem_keys ); - m_gmem_count = utils::any( m_gmem_count ); -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_set::store( int count, Key_type *keys ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - } - - warp_offset += __popc( poll ); - } - - m_gmem_count = utils::any( m_gmem_count > 0 ); - - if ( !m_gmem_count ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - } - - warp_offset += __popc( poll ); - } -} - -// ==================================================================================================================== - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -int Hash_set::store_with_positions( Key_type *keys, int *pos ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - pos [dst_offset] = offset; - } - - warp_offset += __popc( poll ); - } - - m_gmem_count = utils::any( m_gmem_count > 0 ); - - if ( !m_gmem_count ) - { - return warp_offset; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - pos [dst_offset] = SMEM_SIZE + offset; - } - - warp_offset += __popc( poll ); - } - - return warp_offset; -} - - -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -int Hash_set::store( Key_type *keys ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - 
keys[dst_offset] = key; - } - - warp_offset += __popc( poll ); - } - - m_gmem_count = utils::any( m_gmem_count > 0 ); - - if ( !m_gmem_count ) - { - return warp_offset; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - } - - warp_offset += __popc( poll ); - } - - return warp_offset; -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -union Word { char b8[4]; int b32; }; - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE = 128, int NUM_HASH_FCTS = 4, int WARP_SIZE = 32 > -class Hash_map -{ - protected: - // The keys stored in the map. - volatile Key_type *m_smem_keys, *m_gmem_keys; - // Vote buffer for values. - volatile Word *m_smem_vote; - // Registers to store values. - T m_regs_vals[4]; - // The values stored in the map. - T *m_gmem_vals; - // The size of the global memory buffer. - const int m_gmem_size; - // Is there any value in GMEM. - bool m_any_gmem; - - public: - // Constructor. - __device__ __forceinline__ - Hash_map( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, volatile Word *smem_vote, T *gmem_vals, int gmem_size ) : - m_smem_keys(smem_keys), - m_gmem_keys(gmem_keys), - m_smem_vote(smem_vote), - m_gmem_vals(gmem_vals), - m_gmem_size(gmem_size), - m_any_gmem (true) - {} - - // Clear the table. It doesn't clear GMEM values. - __device__ __forceinline__ void clear(); - // Clear the table. It also clears GMEM values (set them to 0). - __device__ __forceinline__ void clear_all(); - // Insert a key/value inside the hash table. - __device__ __forceinline__ void insert( Key_type key, T a_value, T b_value, int *status ); - // Insert a key/value inside the hash table. - __device__ __forceinline__ void insert_with_duplicates( Key_type key, T val, int *status ); - // Load a set. - __device__ __forceinline__ void load( int count, const Key_type *keys, const int *pos ); - // Store the map. - __device__ __forceinline__ void store( int count, T *vals ); - // Store the map. - __device__ __forceinline__ void store( int count, Key_type *keys, T *vals ); - // Store the map. - __device__ __forceinline__ void store_map_keys_scale_values( int count, const int *map, Key_type *keys, T alpha, T *vals ); - // Store the map. - __device__ __forceinline__ void store_keys_scale_values( int count, Key_type *keys, T alpha, T *vals ); - // Update a value in the table but do not insert if it doesn't exist. - __device__ __forceinline__ bool update( Key_type key, T value ); - - protected: - // Get the selected item in the register buffer. - __device__ __forceinline__ int get_selected( int hash ) const - { - return static_cast(m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE]); - } - - // Is it the selected item in the register buffer. - __device__ __forceinline__ bool is_selected( int hash, int lane_id ) const - { - return m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE] == reinterpret_cast(lane_id); - } - - // Push my ID in the register buffer. 
- __device__ __forceinline__ void try_selection( int hash, int lane_id ) - { - m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE] = reinterpret_cast(lane_id); - } -}; - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::clear() -{ - int lane_id = utils::lane_id(); - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - m_regs_vals[i_regs] = amgx::types::util::get_zero(); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - m_gmem_keys[offset] = -1; - } - - m_any_gmem = false; -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::clear_all() -{ - int lane_id = utils::lane_id(); - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - m_regs_vals[i_regs] = amgx::types::util::get_zero(); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - m_gmem_keys[offset] = -1; - m_gmem_vals[offset] = amgx::types::util::get_zero(); - } - - m_any_gmem = false; -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::insert( Key_type key, T a_value, T b_value, int *status ) -{ - const int lane_id = utils::lane_id(); - bool done = key == -1; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( i_hash > 0 && utils::all(done) ) - { - break; - } - - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) - { - this->try_selection( hash, lane_id ); - done = true; - } - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( b_value, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] = m_regs_vals[i_regs] + a_value * other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return; - } - - m_any_gmem = true; - bool candidate = false; - unsigned ukey = 
reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - m_gmem_vals[hash] = m_gmem_vals[hash] + a_value * b_value; - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_gmem_keys[hash] = key; - } - - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. - { - m_gmem_vals[hash] = a_value * b_value; - done = true; - } - } - } - - if ( status == NULL || utils::all(done) ) - { - return; - } - - if ( lane_id == 0 ) - { - status[0] = 1; - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::insert_with_duplicates( Key_type key, T val, int *status ) -{ - const int lane_id = utils::lane_id(); - bool done = key == -1; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - break; - } - - bool candidate = false; - bool maybe_in_conflict = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - maybe_in_conflict = true; - done = true; // Is it really done??? - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) - { - this->try_selection( hash, lane_id ); - maybe_in_conflict = true; - done = true; - } - } - - // Fix conflicts. - bool in_conflict = maybe_in_conflict && !this->is_selected(hash, lane_id); - - while ( utils::any( in_conflict ) ) - { - int winner = in_conflict ? this->get_selected(hash) : WARP_SIZE; - T other_val = utils::shfl( val, winner ); - - if ( in_conflict ) - { - this->try_selection(hash, lane_id); - } - - if ( in_conflict && this->is_selected(hash, lane_id) ) - { - val = val + other_val; - in_conflict = false; - } - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( val, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] = m_regs_vals[i_regs] + other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return; - } - - m_any_gmem = true; - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - utils::atomic_add( &m_gmem_vals[hash], val ); - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_gmem_keys[hash] = key; - } - - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. 
- { - utils::atomic_add( &m_gmem_vals[hash], val ); - done = true; - } - } - } - - if ( status == NULL || utils::all(done) ) - { - return; - } - - if ( lane_id == 0 ) - { - status[0] = 1; - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::load( int count, const Key_type *keys, const int *pos ) -{ - int lane_id = utils::lane_id(); -#pragma unroll 4 - - for ( int offset = lane_id ; offset < count ; offset += WARP_SIZE ) - { - Key_type key = keys[offset]; - int idx = pos [offset]; - // Where to store the item. - volatile Key_type *ptr = m_smem_keys; - - if ( idx >= SMEM_SIZE ) - { - ptr = m_gmem_keys; - m_any_gmem = 1; - idx -= SMEM_SIZE; - m_gmem_vals[idx] = amgx::types::util::get_zero(); - } - - // Store the item. - ptr[idx] = key; - } - - m_any_gmem = utils::any( m_any_gmem ); -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::store( int count, T *vals ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - vals[dst_offset] = m_regs_vals[i_step]; - } - - warp_offset += __popc( poll ); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - vals[dst_offset] = m_gmem_vals[offset]; - } - - warp_offset += __popc( poll ); - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::store( int count, Key_type *keys, T *vals ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - vals[dst_offset] = m_regs_vals[i_step]; - } - - warp_offset += __popc( poll ); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = 
warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - vals[dst_offset] = m_gmem_vals[offset]; - } - - warp_offset += __popc( poll ); - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::store_map_keys_scale_values( int count, const int *map, Key_type *keys, T alpha, T *vals ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = map[key]; - vals[dst_offset] = alpha * m_regs_vals[i_step]; - } - - warp_offset += __popc( poll ); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = map[key]; - vals[dst_offset] = alpha * m_gmem_vals[offset]; - } - - warp_offset += __popc( poll ); - } -} - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::store_keys_scale_values( int count, Key_type *keys, T alpha, T *vals ) -{ - int lane_id = utils::lane_id(); - int lane_mask_lt = utils::lane_mask_lt(); - int warp_offset = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - int poll = utils::ballot( key != -1 ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - vals[dst_offset] = alpha * m_regs_vals[i_step]; - } - - warp_offset += __popc( poll ); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - int poll = utils::ballot( key != -1, utils::activemask() ); - - if ( poll == 0 ) - { - continue; - } - - int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); - - if ( key != -1 ) - { - keys[dst_offset] = key; - vals[dst_offset] = alpha * m_gmem_vals[offset]; - } - - warp_offset += __popc( poll ); - } -} - - - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -bool Hash_map::update( Key_type key, T val ) -{ - const int lane_id = utils::lane_id(); - bool done = key == -1, found = false; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( i_hash > 0 && utils::all(done) ) - { - break; - } - - unsigned ukey = 
reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - found = true; - } - - done = found || stored_key == -1; - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( val, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] += other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) - { - return found; - } - - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) - { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - m_gmem_vals[hash] += val; - found = true; - } - - done = found || stored_key == -1; - } - } - - return found; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/base/include/ld_functions.h b/base/include/ld_functions.h index 6b33771f..615cbb9e 100644 --- a/base/include/ld_functions.h +++ b/base/include/ld_functions.h @@ -102,10 +102,18 @@ __device__ __inline__ int ld_cs(const int *address) return reg; } +#if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 350) +template +__device__ __inline T ldg(const T *address) +{ + return ld_cg(address); +} +#else template __device__ __inline T ldg(const T *address) { return __ldg(address); } +#endif } //end namespace amgx diff --git a/base/include/memory_intrinsics.h b/base/include/memory_intrinsics.h index a2288f9f..d04e3a8c 100644 --- a/base/include/memory_intrinsics.h +++ b/base/include/memory_intrinsics.h @@ -36,7 +36,11 @@ static __device__ __inline__ int __load_all(const int *ptr) { int ret; asm volat static __device__ __inline__ int __load_global(const int *ptr) { int ret; asm volatile ("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int __load_streaming(const int *ptr) { int ret; asm volatile ("ld.global.cs.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int __load_lastuse(const int *ptr) { int ret; asm volatile ("ld.global.lu.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ int __load_nc(const int *ptr) { int ret; asm volatile ("ld.global.nc.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ int __load_nc(const int *ptr) { int ret; asm volatile ("ld.global.ca.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ int __load_uniform(const int *ptr) { int ret; asm volatile ("ld.global.s32.uniform %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } @@ -44,7 +48,11 @@ static __device__ __inline__ int2 __load_all(const int2 *ptr) { int2 ret; asm vo static __device__ __inline__ int2 __load_global(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cg.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int2 __load_streaming(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cs.v2.s32 {%0,%1}, [%2];" : 
"=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int2 __load_lastuse(const int2 *ptr) { int2 ret; asm volatile ("ld.global.lu.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ int2 __load_nc(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ int2 __load_nc(const int2 *ptr) { int2 ret; asm volatile ("ld.global.ca.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ int2 __load_uniform(const int2 *ptr) { int2 ret; asm volatile ("ld.global.s32.v2.uniform {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } @@ -52,7 +60,11 @@ static __device__ __inline__ int4 __load_all(const int4 *ptr) { int4 ret; asm vo static __device__ __inline__ int4 __load_global(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int4 __load_streaming(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ int4 __load_lastuse(const int4 *ptr) { int4 ret; asm volatile ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ int4 __load_nc(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ int4 __load_nc(const int4 *ptr) { int4 ret; asm volatile ("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ int4 __load_uniform(const int4 *ptr) { int4 ret; asm volatile ("ld.global.v4.s32.uniform {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } @@ -60,7 +72,11 @@ static __device__ __inline__ float __load_all(const float *ptr) { float ret; asm static __device__ __inline__ float __load_global(const float *ptr) { float ret; asm volatile ("ld.global.cg.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float __load_streaming(const float *ptr) { float ret; asm volatile ("ld.global.cs.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float __load_lastuse(const float *ptr) { float ret; asm volatile ("ld.global.lu.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 +static __device__ __inline__ float __load_nc(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +#else static __device__ __inline__ float __load_nc(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ float __load_uniform(const float *ptr) { float ret; asm volatile ("ld.global.f32.uniform %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } @@ -68,7 +84,11 @@ static __device__ __inline__ float2 
__load_all(const float2 *ptr) { float2 ret; static __device__ __inline__ float2 __load_global(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cg.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float2 __load_streaming(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cs.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float2 __load_lastuse(const float2 *ptr) { float2 ret; asm volatile ("ld.global.lu.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ float2 __load_nc(const float2 *ptr) { float2 ret; asm volatile ("ld.global.nc.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ float2 __load_nc(const float2 *ptr) { float2 ret; asm volatile ("ld.global.ca.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ float2 __load_uniform(const float2 *ptr) { float2 ret; asm volatile ("ld.global.v2.f32.uniform {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } @@ -76,7 +96,11 @@ static __device__ __inline__ float4 __load_all(const float4 *ptr) { float4 ret; static __device__ __inline__ float4 __load_global(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float4 __load_streaming(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ float4 __load_lastuse(const float4 *ptr) { float4 ret; asm volatile ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ float4 __load_nc(const float4 *ptr) { float4 ret; asm volatile ("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ float4 __load_nc(const float4 *ptr) { float4 ret; asm volatile ("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ float4 __load_uniform(const float4 *ptr) { float4 ret; asm volatile ("ld.global.v4.f32.uniform {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } @@ -84,7 +108,11 @@ static __device__ __inline__ double __load_all(const double *ptr) { double ret; static __device__ __inline__ double __load_global(const double *ptr) { double ret; asm volatile ("ld.global.cg.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ double __load_streaming(const double *ptr) { double ret; asm volatile ("ld.global.cs.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ double __load_lastuse(const double *ptr) { double ret; asm volatile ("ld.global.lu.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ double __load_nc(const double *ptr) { double ret; asm volatile ("ld.global.nc.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR 
(ptr)); return ret; } +#else +static __device__ __inline__ double __load_nc(const double *ptr) { double ret; asm volatile ("ld.global.ca.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ double __load_uniform(const double *ptr) { double ret; asm volatile ("ld.global.f64.uniform %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } @@ -92,5 +120,9 @@ static __device__ __inline__ double2 __load_all(const double2 *ptr) { double2 re static __device__ __inline__ double2 __load_global(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cg.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ double2 __load_streaming(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cs.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } static __device__ __inline__ double2 __load_lastuse(const double2 *ptr) { double2 ret; asm volatile ("ld.global.lu.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } +#if __CUDA_ARCH__ >= 350 static __device__ __inline__ double2 __load_nc(const double2 *ptr) { double2 ret; asm volatile ("ld.global.nc.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } +#else +static __device__ __inline__ double2 __load_nc(const double2 *ptr) { double2 ret; asm volatile ("ld.global.ca.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } +#endif static __device__ __inline__ double2 __load_uniform(const double2 *ptr) { double2 ret; asm volatile ("ld.global.v2.f64.uniform {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } diff --git a/base/include/sm_utils.inl b/base/include/sm_utils.inl index 7b71854b..5d07ae45 100644 --- a/base/include/sm_utils.inl +++ b/base/include/sm_utils.inl @@ -275,29 +275,41 @@ static __device__ __forceinline__ void load_vec4( double (&u)[4], const double * // ==================================================================================================================== static __device__ __forceinline__ unsigned int ballot(int p, unsigned int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __ballot_sync(mask, p); #else return __ballot(p); #endif +#else + return 0; +#endif } static __device__ __forceinline__ unsigned int any(int p, unsigned int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __any_sync(mask, p); #else return __any(p); #endif +#else + return 0; +#endif } static __device__ __forceinline__ unsigned int all(int p, unsigned int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __all_sync(mask, p); #else return __all(p); #endif +#else + return 0; +#endif } static __device__ __forceinline__ unsigned int activemask() @@ -323,25 +335,34 @@ static __device__ __forceinline__ void syncwarp(unsigned int mask = 0xffffffff) // ==================================================================================================================== static __device__ __forceinline__ int shfl( int r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_sync( mask, r, lane, bound ); #else return __shfl( r, lane, bound ); #endif +#else // __CUDA_ARCH__ + return 0.0f; +#endif } static __device__ __forceinline__ float shfl( float r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 
300 #if CUDART_VERSION >= 9000 return __shfl_sync( mask, r, lane, bound ); #else return __shfl( r, lane, bound ); #endif +#else // __CUDA_ARCH__ + return 0.0f; +#endif } static __device__ __forceinline__ double shfl( double r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 int hi = __shfl_sync(mask, __double2hiint(r), lane, bound ); int lo = __shfl_sync(mask, __double2loint(r), lane, bound ); @@ -351,10 +372,14 @@ static __device__ __forceinline__ double shfl( double r, int lane, int bound = w int lo = __shfl( __double2loint(r), lane, bound ); return __hiloint2double( hi, lo ); #endif +#else + return 0.0; +#endif } static __device__ __forceinline__ cuComplex shfl( cuComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 float re = __shfl_sync( mask, cuCrealf(r), lane, bound ); float im = __shfl_sync( mask, cuCimagf(r), lane, bound ); @@ -364,36 +389,52 @@ static __device__ __forceinline__ cuComplex shfl( cuComplex r, int lane, int bou float im = __shfl( cuCimagf(r), lane, bound ); return make_cuComplex(re, im); #endif +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ cuDoubleComplex shfl( cuDoubleComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 double re = shfl( cuCreal(r), lane, mask, bound ); double im = shfl( cuCimag(r), lane, mask, bound ); return make_cuDoubleComplex( re, im ); +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ int shfl_xor( int r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_xor_sync( mask, r, lane_mask, bound ); #else return __shfl_xor( r, lane_mask, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ float shfl_xor( float r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_xor_sync( mask, r, lane_mask, bound ); #else return __shfl_xor( r, lane_mask, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ double shfl_xor( double r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 int hi = __shfl_xor_sync( mask, __double2hiint(r), lane_mask, bound ); int lo = __shfl_xor_sync( mask, __double2loint(r), lane_mask, bound ); @@ -403,10 +444,14 @@ static __device__ __forceinline__ double shfl_xor( double r, int lane_mask, int int lo = __shfl_xor( __double2loint(r), lane_mask, bound ); return __hiloint2double( hi, lo ); #endif +#else + return 0.0; +#endif } static __device__ __forceinline__ cuComplex shfl_xor( cuComplex r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 float re = __shfl_xor_sync( mask, cuCrealf(r), lane_mask, bound ); float im = __shfl_xor_sync( mask, cuCimagf(r), lane_mask, bound ); @@ -416,35 +461,51 @@ static __device__ __forceinline__ cuComplex shfl_xor( cuComplex r, int lane_mask float im = __shfl_xor( cuCimagf(r), lane_mask, bound ); return make_cuComplex(re, im); #endif +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ cuDoubleComplex shfl_xor( cuDoubleComplex r, int lane_mask, int bound = warpSize, unsigned int mask = 
DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 double re = shfl_xor( cuCreal(r), lane_mask, mask, bound ); double im = shfl_xor( cuCimag(r), lane_mask, mask, bound ); return make_cuDoubleComplex( re, im ); +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ int shfl_down( int r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_down_sync( mask, r, offset, bound ); #else return __shfl_down( r, offset, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ float shfl_down( float r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_down_sync( mask, r, offset, bound ); #else return __shfl_down( r, offset, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ double shfl_down( double r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 int hi = __shfl_down_sync( mask, __double2hiint(r), offset, bound ); int lo = __shfl_down_sync( mask, __double2loint(r), offset, bound ); @@ -454,10 +515,14 @@ static __device__ __forceinline__ double shfl_down( double r, int offset, int bo int lo = __shfl_down( __double2loint(r), offset, bound ); return __hiloint2double( hi, lo ); #endif +#else + return 0.0; +#endif } static __device__ __forceinline__ cuComplex shfl_down( cuComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 float re = __shfl_down_sync( mask, cuCrealf(r), lane, bound ); float im = __shfl_down_sync( mask, cuCimagf(r), lane, bound ); @@ -467,36 +532,52 @@ static __device__ __forceinline__ cuComplex shfl_down( cuComplex r, int lane, in float im = __shfl_down( cuCimagf(r), lane, bound ); return make_cuComplex(re, im); #endif +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ cuDoubleComplex shfl_down( cuDoubleComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 double re = shfl_down( cuCreal(r), lane, bound ); double im = shfl_down( cuCimag(r), lane, bound ); return make_cuDoubleComplex( re, im ); +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ int shfl_up( int r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_up_sync( mask, r, offset, bound ); #else return __shfl_up( r, offset, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ float shfl_up( float r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 return __shfl_up_sync( mask, r, offset, bound ); #else return __shfl_up( r, offset, bound ); #endif +#else + return 0.0f; +#endif } static __device__ __forceinline__ double shfl_up( double r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 int hi = __shfl_up_sync( mask, __double2hiint(r), offset, bound ); int lo = __shfl_up_sync( mask, __double2loint(r), offset, bound ); @@ -506,10 +587,14 @@ static __device__ __forceinline__ double shfl_up( double r, int offset, int boun int lo = __shfl_up( __double2loint(r), offset, bound ); return __hiloint2double( hi, lo ); #endif +#else + return 0.0; 
+#endif } static __device__ __forceinline__ cuComplex shfl_up( cuComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 #if CUDART_VERSION >= 9000 float re = __shfl_up_sync( mask, cuCrealf(r), lane, bound ); float im = __shfl_up_sync( mask, cuCimagf(r), lane, bound ); @@ -519,13 +604,20 @@ static __device__ __forceinline__ cuComplex shfl_up( cuComplex r, int lane, int float im = __shfl_up( cuCimagf(r), lane, bound ); return make_cuComplex(re, im); #endif +#else + return amgx::types::util::get_zero(); +#endif } static __device__ __forceinline__ cuDoubleComplex shfl_up( cuDoubleComplex r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { +#if __CUDA_ARCH__ >= 300 double re = shfl_up( cuCreal(r), lane, bound ); double im = shfl_up( cuCimag(r), lane, bound ); return make_cuDoubleComplex( re, im ); +#else + return amgx::types::util::get_zero(); +#endif } // ==================================================================================================================== @@ -538,6 +630,8 @@ struct Add static __device__ __forceinline__ Value_type eval( Value_type x, Value_type y ) { return x + y; } }; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + template< int NUM_THREADS_PER_ITEM, int WarpSize > struct Warp_reduce_pow2 { @@ -579,6 +673,51 @@ struct Warp_reduce_linear } }; +#else + +template< int NUM_THREADS_PER_ITEM, int WarpSize > +struct Warp_reduce_pow2 +{ + template< typename Operator, typename Value_type > + static __device__ __inline__ Value_type execute( volatile Value_type *smem, Value_type x ) + { + int my_lane_id = utils::lane_id(); +#pragma unroll + + for ( int offset = WarpSize / 2 ; offset >= NUM_THREADS_PER_ITEM ; offset >>= 1 ) + if ( my_lane_id < offset ) + { + x = Operator::eval( x, smem[threadIdx.x + offset] ); + amgx::types::util::volcast(x, smem + threadIdx.x); + } + + return x; + } +}; + +template< int NUM_THREADS_PER_ITEM, int WarpSize > +struct Warp_reduce_linear +{ + template< typename Operator, typename Value_type > + static __device__ __inline__ Value_type execute( volatile Value_type *smem, Value_type x ) + { + const int NUM_STEPS = WarpSize / NUM_THREADS_PER_ITEM; + int my_lane_id = utils::lane_id(); +#pragma unroll + + for ( int i = 1 ; i < NUM_STEPS ; ++i ) + if ( my_lane_id < NUM_THREADS_PER_ITEM ) + { + x = Operator::eval( x, smem[threadIdx.x + i * NUM_THREADS_PER_ITEM] ); + amgx::types::util::volcast(x, smem + threadIdx.x); + } + + return x; + } +}; + +#endif + // ==================================================================================================================== template< int NUM_THREADS_PER_ITEM, int WarpSize = 32 > @@ -622,12 +761,41 @@ struct Warp_reduce<15, WarpSize> : public Warp_reduce_linear<15, WarpSize> {}; // ==================================================================================================================== +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + template< int NUM_THREADS_PER_ITEM, typename Operator, typename Value_type > static __device__ __forceinline__ Value_type warp_reduce( Value_type x ) { return Warp_reduce::template execute( x ); } +#else + +template< int NUM_THREADS_PER_ITEM, typename Operator, typename Value_type > +static __device__ __forceinline__ Value_type warp_reduce( volatile Value_type *smem, Value_type x ) +{ + return Warp_reduce::template execute( smem, x ); +} + +template< int NUM_THREADS_PER_ITEM, typename Value_type, int WarpSize = 32 > +static __device__ __forceinline__ Value_type warp_reduce_sum(volatile 
Value_type *smem, Value_type x) +{ + const int NUM_STEPS = WarpSize / NUM_THREADS_PER_ITEM; + int my_lane_id = utils::lane_id(); +#pragma unroll + + for (int i = 1; i < NUM_STEPS; ++i) + if (my_lane_id < NUM_THREADS_PER_ITEM) + { + x = x + amgx::types::util::volcast(smem[threadIdx.x + i * NUM_THREADS_PER_ITEM]); + amgx::types::util::volcast(x, smem + threadIdx.x); + } + + return x; +} + +#endif + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace utils diff --git a/base/include/strided_reduction.h b/base/include/strided_reduction.h index 2a64541a..df5ca976 100644 --- a/base/include/strided_reduction.h +++ b/base/include/strided_reduction.h @@ -99,7 +99,12 @@ template struct is_pow2 { static const int value = (I & (I - 1)) == 0; // detect arch #define strided_reduction_arch_KEPLER 2 +#define strided_reduction_arch_FERMI 1 +#if __CUDA_ARCH__ >= 350 #define strided_reduction_ARCH strided_reduction_arch_KEPLER +#else +#define strided_reduction_ARCH strided_reduction_arch_FERMI +#endif //ops struct op_sum @@ -391,6 +396,34 @@ struct warputil<8, strided_reduction_arch_KEPLER> } }; +template +struct warputil +{ + template SR_SDF void warp_reduce_stride(T &value, const OP &op) + { + __shared__ volatile T s_buf[CTA_SIZE + WARP_SIZE]; + const int laneId = utils::lane_id(); + { + s_buf[threadIdx.x] = value; +#pragma unroll + for (int i = STRIDE; i < NVALS; i *= 2) + { + T tmp = s_buf[threadIdx.x + i]; + + if (laneId + i < WARP_SIZE) + { + value = op.compute(value, tmp); + } + + s_buf[threadIdx.x] = value; + } + + if (STRIDE & (STRIDE - 1) == 0) { value = s_buf[threadIdx.x - laneId + (threadIdx.x & (STRIDE - 1))]; } + } + } +}; + + ///////////////////////////////// // STRUDED REDUCTION KERNEL ///////////////////////////////// @@ -399,10 +432,10 @@ struct warputil<8, strided_reduction_arch_KEPLER> template __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 2 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 2 ) +__launch_bounds__( CTA_SIZE, 1 ) #endif void strided_reduction(const T *X, const int N, V *sums, const TRANSFORM tx = TRANSFORM(), const OP op = OP()) { diff --git a/base/include/texture.h b/base/include/texture.h index b11dd4e2..b4913b38 100644 --- a/base/include/texture.h +++ b/base/include/texture.h @@ -38,7 +38,11 @@ namespace amgx template __inline__ __device__ T_ELEM __cachingLoad(const T_ELEM *addr) { +#if __CUDA_ARCH__ < 350 + return *addr; +#else return __ldg(addr); +#endif } } diff --git a/base/src/classical/selectors/selector.cu b/base/src/classical/selectors/selector.cu index 9882489a..0a3fccd4 100644 --- a/base/src/classical/selectors/selector.cu +++ b/base/src/classical/selectors/selector.cu @@ -119,36 +119,36 @@ void resolve_boundary(const IndexType *offsets, const IndexType *column_indices, } } -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -namespace selector_sm35 +namespace selector_sm20 { #include -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. 
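/*
 * Illustrative sketch of the dynamic work-queue idiom used by the get_work()
 * variants below: lane 0 of each warp takes a ticket from a global counter
 * with atomicAdd and the result is broadcast to the other 31 lanes. On sm_30+
 * the broadcast is a warp shuffle; the sm_20 variant falls back to one shared
 * memory slot per warp and relies on implicit warp-synchronous execution on
 * those parts. Names prefixed with sketch_ are illustrative, not AMGX symbols.
 */
static __device__ __forceinline__ int sketch_get_work_shfl( int *queue )
{
    int offset = -1;

    if ( threadIdx.x % 32 == 0 )
    {
        offset = atomicAdd( queue, 1 );                // one ticket per warp
    }

#if CUDART_VERSION >= 9000
    return __shfl_sync( 0xffffffffu, offset, 0 );      // broadcast lane 0's ticket
#else
    return __shfl( offset, 0 );
#endif
}

static __device__ __forceinline__ int sketch_get_work_smem( volatile int *offsets, int *queue, int warp_id )
{
    if ( threadIdx.x % 32 == 0 )
    {
        offsets[warp_id] = atomicAdd( queue, 1 );      // one shared slot per warp
    }

    return offsets[warp_id];                           // the other lanes read the slot
}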
-__device__ __forceinline__ int get_work( int *queue, int warp_id ) +__device__ __forceinline__ int get_work( volatile int *offsets, int *queue, int warp_id ) { - int offset = -1; - if ( utils::lane_id() == 0 ) { - offset = atomicAdd( queue, 1 ); + offsets[warp_id] = atomicAdd( queue, 1 ); } - return utils::shfl( offset, 0 ); + return offsets[warp_id]; } -} +} // namespace distance2_sm20 + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -namespace selector_sm70 +namespace selector_sm35 { #include -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. __device__ __forceinline__ int get_work( int *queue, int warp_id ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -157,10 +157,14 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) } return utils::shfl( offset, 0 ); +#else + return 0; +#endif } } + namespace selector { @@ -431,22 +435,30 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - amgx::classical::selector_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 amgx::classical::selector_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + amgx::classical::selector_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip fine rows. @@ -461,6 +473,7 @@ compute_c_hat_kernel( int A_num_rows, set.clear(); // Load the range of the row. __syncthreads(); +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -470,6 +483,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif __syncthreads(); // _iterate over the columns of A to build C_hat. @@ -592,6 +615,13 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. 
+ __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -601,16 +631,17 @@ compute_c_hat_kernel( int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - amgx::classical::selector_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 amgx::classical::selector_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + amgx::classical::selector_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = amgx::classical::selector_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip fine rows. @@ -624,6 +655,7 @@ compute_c_hat_kernel( int A_num_rows, // Clear the set. set.clear(); // Load the range of the row. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -633,6 +665,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // _iterate over the columns of A to build C_hat. for ( int a_col_it = a_col_begin + lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) diff --git a/base/src/csr_multiply.cu b/base/src/csr_multiply.cu index 616fcb43..59ca324f 100644 --- a/base/src/csr_multiply.cu +++ b/base/src/csr_multiply.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2013-2020, NVIDIA CORPORATION. All rights reserved. +/* Copyright (c) 2013-2017, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,8 +26,8 @@ */ #include +#include #include -#include #include #include #include @@ -43,16 +43,16 @@ void *CSR_Multiply >::csr_workspace_create( cudaDeviceProp props = getDeviceProperties(); int arch = 10 * props.major + props.minor; - if ( arch >= 70 ) - { - return new CSR_Multiply_Sm70(); - } if ( arch >= 35 ) { return new CSR_Multiply_Sm35(); } + else if ( arch >= 20 ) + { + return new CSR_Multiply_Sm20(); + } - FatalError( "CSR_Multiply: Unsupported architecture. It requires a Kepler GPU or newer!!!", AMGX_ERR_NOT_SUPPORTED_BLOCKSIZE ); + FatalError( "CSR_Multiply: Unsupported architecture. 
It requires a Fermi GPU or newer!!!", AMGX_ERR_NOT_SUPPORTED_BLOCKSIZE ); } // ==================================================================================================================== @@ -64,20 +64,20 @@ void *CSR_Multiply >::csr_workspace_create( cudaDeviceProp props = getDeviceProperties(); int arch = 10 * props.major + props.minor; - if ( arch >= 70 ) + if ( arch >= 35 ) { - CSR_Multiply_Sm70 *wk = new CSR_Multiply_Sm70(); + CSR_Multiply_Sm35 *wk = new CSR_Multiply_Sm35(); wk->set_max_attempts(max_attempts); return wk; } - if ( arch >= 35 ) + else if ( arch >= 20 ) { - CSR_Multiply_Sm35 *wk = new CSR_Multiply_Sm35(); + CSR_Multiply_Sm20 *wk = new CSR_Multiply_Sm20(); wk->set_max_attempts(max_attempts); return wk; } - FatalError( "CSR_Multiply: Unsupported architecture. It requires a Kepler GPU or newer!!!", AMGX_ERR_NOT_SUPPORTED_BLOCKSIZE ); + FatalError( "CSR_Multiply: Unsupported architecture. It requires a Fermi GPU or newer!!!", AMGX_ERR_NOT_SUPPORTED_BLOCKSIZE ); } // ==================================================================================================================== diff --git a/base/src/csr_multiply_sm35.cu b/base/src/csr_multiply_sm35.cu index b86d76cc..c8376ec5 100644 --- a/base/src/csr_multiply_sm35.cu +++ b/base/src/csr_multiply_sm35.cu @@ -31,7 +31,6 @@ #include #include #include -#include /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -57,6 +56,7 @@ void flag_halo_rows(int *row_ids, int size, int *flagArray, int neighbor, int gl __device__ __forceinline__ int get_work( int *queue, int warp_id ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -65,6 +65,9 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) } return utils::shfl( offset, 0 ); +#else + return 1 << 28; +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -88,6 +91,7 @@ count_non_zeroes_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; @@ -229,6 +233,8 @@ count_non_zeroes_kernel( const int A_num_rows, set.store( count, &C_cols[c_col_it] ); } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -252,6 +258,7 @@ count_non_zeroes_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. @@ -399,6 +406,8 @@ count_non_zeroes_kernel( const int A_num_rows, set.store( count, &C_cols[c_col_it] ); } } + +#endif } template @@ -483,6 +492,7 @@ count_non_zeroes_RAP_ext_kernel( const int RAP_int_num_rows, int num_neighbors, int global_id ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; @@ -571,6 +581,8 @@ count_non_zeroes_RAP_ext_kernel( const int RAP_int_num_rows, set.store( count, &RAP_cols[rap_col_it] ); } } + +#endif } @@ -608,6 +620,7 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Tables to broadcast values. 
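/*
 * The "tables to broadcast values" declared below are filled with a warp
 * stream-compaction idiom that recurs throughout these kernels: each lane
 * computes a predicate, a ballot collects the votes, and a popcount over the
 * votes of the lower lanes (lane_mask_lt) gives every qualifying lane a dense
 * destination slot. A minimal sketch of that idiom with raw intrinsics
 * (sketch_warp_compact is an illustrative name, not an AMGX symbol):
 */
__device__ int sketch_warp_compact( bool pred, int value, volatile int *s_out )
{
    const unsigned full = 0xffffffffu;
    const unsigned vote = __ballot_sync( full, pred );
    const unsigned lt   = ( 1u << ( threadIdx.x % 32 ) ) - 1u;  // lanes below mine
    const int dest = __popc( vote & lt );

    if ( pred )
    {
        s_out[dest] = value;      // packed contiguously, in lane order
    }

    return __popc( vote );        // number of packed items
}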
__shared__ volatile int s_b_rows[CTA_SIZE], s_b_colors[CTA_SIZE]; @@ -768,6 +781,8 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, set.store( count, &C_cols[c_col_it] ); } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -786,6 +801,7 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // Tables to broadcast values. @@ -959,6 +975,8 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, set.store( count, &C_cols[c_col_it] ); } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -986,6 +1004,7 @@ compute_values_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / 32; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; @@ -1114,6 +1133,8 @@ compute_values_kernel( const int A_num_rows, map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1141,6 +1162,7 @@ compute_values_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. @@ -1277,6 +1299,8 @@ compute_values_kernel( const int A_num_rows, map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); } + +#endif } @@ -1301,6 +1325,7 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, int num_neighbors, int *wk_status ) { +#if __CUDA_ARCH__ >= 350 const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; @@ -1366,6 +1391,8 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, map.store( count, &RAP_cols[rap_col_it], &RAP_vals[rap_col_it] ); } + +#endif } diff --git a/base/src/csr_multiply_sm70.cu b/base/src/csr_multiply_sm70.cu deleted file mode 100644 index 7d4063d5..00000000 --- a/base/src/csr_multiply_sm70.cu +++ /dev/null @@ -1,2124 +0,0 @@ -/* Copyright (c) 2013-2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace csr_multiply_sm70 -{ - -#include - -#include -#include // Included inside the namespace to solve name colisions. - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -__global__ -void flag_halo_rows(int *row_ids, int size, int *flagArray, int neighbor, int global_id) -{ - for (int tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < size; tidx += blockDim.x * gridDim.x) - { - int row_id = row_ids[tidx]; - flagArray[row_id] = tidx; - } -} - -__device__ __forceinline__ int get_work( int *queue, int warp_id ) -{ - int offset = -1; - - if ( utils::lane_id() == 0 ) - { - offset = atomicAdd( queue, 1 ); - } - - return utils::shfl( offset, 0 ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY > -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_kernel( const int A_num_rows, - const int *A_rows, - const int *A_cols, - const int *B_rows, - const int *B_cols, - int *C_rows, - int *C_cols, - int *Aq1, - int *Bq1, - int *Aq2, - int *Bq2, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id(); - const int lane_id = utils::lane_id(); - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - int c_row_id = a_row_id; - - if (Aq1 != NULL) - { - a_row_id = Aq1[a_row_id]; - } - - // Make sure we have to proceed. - if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. 
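/*
 * The surrounding loop implements a warp-cooperative expansion idiom: each
 * lane loads one column of A (i.e. one row of B), ballot/popc count how many
 * lanes hold a valid row, and the warp then walks those rows one at a time,
 * broadcasting each row id with a shuffle so all 32 lanes scan that row of B
 * together (active lanes are the first num_rows lanes because a_col_it is
 * strided by lane). A condensed sketch of just that idiom, with the utils::
 * wrappers replaced by raw intrinsics and the hash set omitted:
 */
__device__ void sketch_expand_row( const int *A_cols, int a_col_it, int a_col_end,
                                   const int *B_rows, const int *B_cols )
{
    const unsigned full = 0xffffffffu;
    const int lane = threadIdx.x % 32;

    for ( a_col_it += lane ; __any_sync( full, a_col_it < a_col_end ) ; a_col_it += 32 )
    {
        const bool active = a_col_it < a_col_end;
        const int b_row = active ? A_cols[a_col_it] : -1;
        const int num_rows = __popc( __ballot_sync( full, active ) );

        for ( int k = 0 ; k < num_rows ; ++k )
        {
            // Broadcast lane k's B-row id to the whole warp.
            const int row = __shfl_sync( full, b_row, k );
            const int beg = B_rows[row], end = B_rows[row + 1];

            for ( int it = beg + lane ; it < end ; it += 32 )
            {
                const int col = B_cols[it];
                (void) col;  // a real kernel inserts col into the per-warp hash set
            }
        }
    }
}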
- int b_row_id = -1; - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - - //b_row_id is actually column of A - if (Aq2 != NULL) - { - b_row_id = Aq2[b_row_id]; - } - - if (Bq1 != NULL) - { - b_row_id = Bq1[b_row_id]; - } - } - - // The number of valid rows. - const int num_rows = __popc( utils::ballot(is_active) ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; ++k ) - { - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = utils::shfl( b_row_id, k ); - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( lane_id < 2 ) - { - b_col_tmp = utils::Ld::load( &B_rows[uniform_b_row_id + lane_id] ); - } - - int b_col_it = utils::shfl( b_col_tmp, 0 ); - int b_col_end = utils::shfl( b_col_tmp, 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id ; utils::any(b_col_it < b_col_end) ; b_col_it += WARP_SIZE ) - { - int b_col_id = -1; - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &B_cols[b_col_it] ); - - // b_col_id is actually column of B - if (Bq2 != NULL) - { - b_col_id = Bq2[b_col_id]; - } - } - - set.insert( b_col_id, COUNT_ONLY ? wk_status : NULL ); - } - } - } - - // Store the results. - if ( COUNT_ONLY ) - { - int count = set.compute_size(); - - if ( lane_id == 0 ) - { - C_rows[c_row_id] = count; - } - } - else - { - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[c_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &C_cols[c_col_it] ); - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int NUM_THREADS_PER_ROW, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY > -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const int *__restrict B_rows, - const int *__restrict B_cols, - int *__restrict C_rows, - int *__restrict C_cols, - int *Aq1, - int *Bq1, - int *Aq2, - int *Bq2, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // Constants. - const int lane_id_div_num_threads = lane_id / NUM_THREADS_PER_ROW; - const int lane_id_mod_num_threads = lane_id % NUM_THREADS_PER_ROW; - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - int c_row_id = a_row_id; - - if (Aq1 != NULL) - { - a_row_id = Aq1[a_row_id]; - } - - // Make sure we have to proceed. 
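/*
 * The COUNT_ONLY early-out just below (set.has_failed() / *wk_status) is the
 * device half of a count-then-retry protocol: when a warp's hash set
 * overflows, it raises the status flag and every warp bails out; the presence
 * of set_max_attempts in csr_multiply.cu above suggests the host then re-runs
 * the counting pass with a larger global hash table. A rough, toy sketch of
 * that protocol under those assumptions (sketch_ names and the failure
 * threshold are placeholders, not AMGX behaviour):
 */
__global__ void sketch_count_kernel( int gmem_size, int *status )
{
    // Toy stand-in: pretend counting fails whenever the capacity is too small.
    if ( gmem_size < 8192 && threadIdx.x == 0 && blockIdx.x == 0 )
    {
        atomicExch( status, 1 );   // signal "hash set overflowed, please retry"
    }
}

void sketch_count_with_retry( int max_attempts )
{
    int *d_status = NULL;
    cudaMalloc( &d_status, sizeof(int) );
    int gmem_size = 2048;                                  // initial per-row hash capacity

    for ( int attempt = 0 ; attempt < max_attempts ; ++attempt )
    {
        cudaMemset( d_status, 0, sizeof(int) );
        sketch_count_kernel<<<128, 256>>>( gmem_size, d_status );
        int status = 0;
        cudaMemcpy( &status, d_status, sizeof(int), cudaMemcpyDeviceToHost );

        if ( status == 0 ) { break; }                      // counting succeeded
        gmem_size *= 2;                                    // overflow: retry with more room
    }

    cudaFree( d_status );
}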
- if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. - int b_row_id = -1; - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - - //b_row_id is actually column of A - if (Aq2 != NULL) - { - b_row_id = Aq2[b_row_id]; - } - - if (Bq1 != NULL) - { - b_row_id = Bq1[b_row_id]; - } - } - - const int num_rows = __popc( utils::ballot(is_active) ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS ) - { - int local_k = k + lane_id_div_num_threads; - // Is it an active thread. - bool is_active_k = local_k < num_rows; - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = utils::shfl( b_row_id, local_k ); - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( is_active_k && lane_id_mod_num_threads < 2 ) - { - b_col_tmp = utils::Ld::load( &B_rows[uniform_b_row_id + lane_id_mod_num_threads] ); - } - - int b_col_it = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 0 ); - int b_col_end = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id_mod_num_threads ; utils::any(b_col_it < b_col_end) ; b_col_it += NUM_THREADS_PER_ROW ) - { - int b_col_id = -1; - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &B_cols[b_col_it] ); - - // b_col_id is actually column of B - if (Bq2 != NULL) - { - b_col_id = Bq2[b_col_id]; - } - } - - set.insert( b_col_id, COUNT_ONLY ? wk_status : NULL ); - } - } - } - - // Store the results. - if ( COUNT_ONLY ) - { - int count = set.compute_size_with_duplicates(); - - if ( lane_id == 0 ) - { - C_rows[c_row_id] = count; - } - } - else - { - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[c_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &C_cols[c_col_it] ); - } - } -} - -template -__device__ __forceinline__ void sparse_add_process_row(int row_id, const int *__restrict__ row_offsets, const int *__restrict__ col_indices, int lane_id, Hash_set &set, int *wk_status, int global_id, bool print_flag) -{ - // Load the range of the row of RAP_int - int col_tmp = -1; - - if ( lane_id < 2 ) - { - col_tmp = utils::Ld::load( &row_offsets[row_id + lane_id] ); - } - - int col_it = utils::shfl( col_tmp, 0 ); - int col_end = utils::shfl( col_tmp, 1 ); - - // Iterate over the columns of RAP_int - for ( col_it += lane_id ; utils::any(col_it < col_end) ; col_it += WARP_SIZE ) - { - int col_id = -1; - - if ( col_it < col_end ) - { - col_id = utils::Ld::load( &col_indices[col_it] ); - } - - set.insert( col_id, COUNT_ONLY ? 
wk_status : NULL ); - } -} - - -template -__device__ __forceinline__ void sparse_add_process_row_values(int row_id, const int *__restrict__ row_offsets, const int *__restrict__ col_indices, const Value_type *__restrict vals, int lane_id, Hash_map &map, int *wk_status) -{ - // Load the range of the row. - int col_tmp = -1; - - if ( lane_id < 2 ) - { - col_tmp = utils::Ld::load( &row_offsets[row_id + lane_id] ); - } - - int col_it = utils::shfl( col_tmp, 0 ); - int col_end = utils::shfl( col_tmp, 1 ); - - // Iterate over the columns of A. - for ( col_it += lane_id ; utils::any(col_it < col_end) ; col_it += WARP_SIZE ) - { - const bool is_active = col_it < col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. - int col_id(-1); - Value_type value = amgx::types::util::get_zero(); - - if ( is_active ) - { - col_id = utils::Ld::load( &col_indices[col_it] ); - value = utils::Ld::load( &vals[col_it] ); - } - - map.insert_with_duplicates( col_id, value, wk_status ); - } -} - - - - -template -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_RAP_ext_kernel( const int RAP_int_num_rows, - const int *__restrict RAP_int_rows, - const int *__restrict RAP_int_cols, - int **RAP_ext_row_ptrs, - int **RAP_ext_col_ptrs, - int *__restrict RAP_rows, - int *__restrict RAP_cols, - int **flagArray_ptrs, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status, - int num_neighbors, - int global_id ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // First threads load the row IDs of A needed by the CTA... - int rap_int_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[rap_int_row_id * gmem_size], gmem_size ); - - // Loop over rows of RAP_ext - for ( ; rap_int_row_id < RAP_int_num_rows; rap_int_row_id = get_work( wk_work_queue, warp_id ) ) - { - // Make sure we have to proceed. - if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // --------------------------------- - // First process RAP_int - // --------------------------------- - bool print_flag = false; - - if (rap_int_row_id == 3 && global_id == 0) - { - print_flag = true; - } - - sparse_add_process_row(rap_int_row_id, RAP_int_rows, RAP_int_cols, lane_id, set, wk_status, global_id, print_flag); - - // --------------------------------- - // Then process RAP_ext - // --------------------------------- - - for (int i = 0; i < num_neighbors; i++) - { - int flag = flagArray_ptrs[i][rap_int_row_id]; - - if (flag != -1) - { - int *RAP_ext_rows = RAP_ext_row_ptrs[i]; - int *RAP_ext_cols = RAP_ext_col_ptrs[i]; - int pos_in_row_ext = flag; - sparse_add_process_row(pos_in_row_ext, RAP_ext_rows, RAP_ext_cols, lane_id, set, wk_status, global_id, print_flag); - } - } - - // Store the results. 
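/*
 * The COUNT_ONLY template parameter drives a two-pass symbolic phase: the
 * first pass writes a per-row non-zero count into the row-offset array, the
 * host turns the counts into offsets with an exclusive scan, and the second
 * pass re-runs the same kernel with COUNT_ONLY=false to fill the column
 * indices into the reserved slots. A serial CPU reference of that
 * count -> scan -> fill scheme for C = A * B in CSR form, meant only as a
 * mental model of what the warp kernels compute:
 */
#include <vector>
#include <unordered_set>

void sketch_spgemm_symbolic( int n_rows,
                             const std::vector<int> &A_rows, const std::vector<int> &A_cols,
                             const std::vector<int> &B_rows, const std::vector<int> &B_cols,
                             std::vector<int> &C_rows, std::vector<int> &C_cols )
{
    C_rows.assign( n_rows + 1, 0 );

    // Pass 1: count distinct columns per row of C (the COUNT_ONLY path).
    for ( int i = 0 ; i < n_rows ; ++i )
    {
        std::unordered_set<int> row;
        for ( int k = A_rows[i] ; k < A_rows[i + 1] ; ++k )
            for ( int j = B_rows[A_cols[k]] ; j < B_rows[A_cols[k] + 1] ; ++j )
                row.insert( B_cols[j] );
        C_rows[i + 1] = (int) row.size();
    }

    // Exclusive scan: per-row counts become row offsets.
    for ( int i = 0 ; i < n_rows ; ++i ) C_rows[i + 1] += C_rows[i];

    // Pass 2: fill the column indices (the COUNT_ONLY=false path).
    C_cols.resize( C_rows[n_rows] );
    for ( int i = 0 ; i < n_rows ; ++i )
    {
        std::unordered_set<int> row;
        int out = C_rows[i];
        for ( int k = A_rows[i] ; k < A_rows[i + 1] ; ++k )
            for ( int j = B_rows[A_cols[k]] ; j < B_rows[A_cols[k] + 1] ; ++j )
                if ( row.insert( B_cols[j] ).second ) C_cols[out++] = B_cols[j];
    }
}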
- if ( COUNT_ONLY ) - { - int count = set.compute_size_with_duplicates(); - - if ( lane_id == 0 ) - { - RAP_rows[rap_int_row_id] = count; - } - } - else - { - int rap_col_tmp = -1; - - if ( lane_id < 2 ) - { - rap_col_tmp = utils::Ld::load( &RAP_rows[rap_int_row_id + lane_id] ); - } - - int rap_col_it = utils::shfl( rap_col_tmp, 0 ); - int rap_col_end = utils::shfl( rap_col_tmp, 1 ); - // Store the results. - int count = rap_col_end - rap_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &RAP_cols[rap_col_it] ); - } - } -} - - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -struct Without_external_diag -{ - static __device__ __forceinline__ bool is_active ( int a_col_it, int a_col_end ) { return a_col_it < a_col_end; } - static __device__ __forceinline__ bool is_boundary( int a_col_it, int a_col_end ) { return false; } -}; - -// ==================================================================================================================== - -struct With_external_diag -{ - static __device__ __forceinline__ bool is_active ( int a_col_it, int a_col_end ) { return a_col_it <= a_col_end; } - static __device__ __forceinline__ bool is_boundary( int a_col_it, int a_col_end ) { return a_col_it == a_col_end; } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY, typename Diag_traits > -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_ilu1_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const int *__restrict A_coloring, - int *__restrict C_rows, - int *__restrict C_cols, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Tables to broadcast values. - __shared__ volatile int s_b_rows[CTA_SIZE], s_b_colors[CTA_SIZE]; - // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - // Make sure we have to proceed. - if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // The color of the row. - int a_row_color = A_coloring[a_row_id]; - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(Diag_traits::is_active(a_col_it, a_col_end)) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. 
- int b_row_id = -1; - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - } - - if ( Diag_traits::is_boundary(a_col_it, a_col_end) ) - { - b_row_id = a_row_id; - } - - // Push the columns in the set. - set.insert( b_row_id, COUNT_ONLY ? wk_status : NULL ); - - // Skip computation if the color of the row is 0. - if ( a_row_color != 0 ) - { - // Gather the colors of the columns. - int b_row_color = -1; - - if ( is_active ) - { - b_row_color = A_coloring[b_row_id]; - } - - // The number of valid rows. - int pred = is_active && b_row_color < a_row_color; - int vote = utils::ballot( pred ); - int dest = __popc( vote & utils::lane_mask_lt() ); - - if ( pred ) - { - s_b_rows [warp_id * WARP_SIZE + dest] = b_row_id; - s_b_colors[warp_id * WARP_SIZE + dest] = b_row_color; // TODO: store an int2 rather than 2 ints of SM35 with 64bit banks. - } - - const int num_rows = __popc( vote ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; ++k ) - { - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = s_b_rows [warp_id * WARP_SIZE + k]; - const int uniform_b_color = s_b_colors[warp_id * WARP_SIZE + k]; - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( lane_id < 2 ) - { - b_col_tmp = utils::Ld::load( &A_rows[uniform_b_row_id + lane_id] ); - } - - int b_col_it = utils::shfl( b_col_tmp, 0 ); - int b_col_end = utils::shfl( b_col_tmp, 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id ; utils::any(b_col_it < b_col_end) ; b_col_it += WARP_SIZE ) - { - int b_col_id = -1, b_col_color = -1; - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &A_cols[b_col_it] ); - b_col_color = utils::Ld::load( &A_coloring[b_col_id] ); - } - - int item = -1; - - if ( b_col_color >= uniform_b_color && b_col_color != a_row_color ) - { - item = b_col_id; - } - - set.insert( item, COUNT_ONLY ? wk_status : NULL ); - } - } - } - } - - // Store the results. - if ( COUNT_ONLY ) - { - int count = set.compute_size(); - - if ( lane_id == 0 ) - { - C_rows[a_row_id] = count; - } - } - else - { - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[a_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &C_cols[c_col_it] ); - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int NUM_THREADS_PER_ROW, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY, typename Diag_traits > -__global__ __launch_bounds__( CTA_SIZE ) -void -count_non_zeroes_ilu1_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const int *__restrict A_coloring, - int *__restrict C_rows, - int *__restrict C_cols, - const int gmem_size, - int *g_keys, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; - // Tables to broadcast values. - __shared__ volatile int s_b_rows[CTA_SIZE], s_b_colors[CTA_SIZE]; - // The hash keys stored in shared memory. - __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The coordinates of the thread inside the CTA/warp. 
- const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // Constants. - const int lane_id_div_num_threads = lane_id / NUM_THREADS_PER_ROW; - const int lane_id_mod_num_threads = lane_id % NUM_THREADS_PER_ROW; - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - // Make sure we have to proceed. - if ( COUNT_ONLY ) - { - volatile int *status = reinterpret_cast( wk_status ); - - if ( set.has_failed() || *status != 0 ) - { - return; - } - } - - // Clear the set. - set.clear(); - // The color of the row. - int a_row_color = A_coloring[a_row_id]; - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(Diag_traits::is_active(a_col_it, a_col_end)) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. - int b_row_id = -1; - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - } - - if ( Diag_traits::is_boundary(a_col_it, a_col_end) ) - { - b_row_id = a_row_id; - } - - // Push the columns in the set. - set.insert( b_row_id, COUNT_ONLY ? wk_status : NULL ); - - // Skip computation if the color of the row is 0. - if ( a_row_color != 0 ) - { - // Gather the colors of the columns. - int b_row_color = -1; - - if ( is_active ) - { - b_row_color = A_coloring[b_row_id]; - } - - // The number of valid rows. - int pred = is_active && b_row_color < a_row_color; - int vote = utils::ballot( pred ); - int dest = __popc( vote & utils::lane_mask_lt() ); - - if ( pred ) - { - s_b_rows [warp_id * WARP_SIZE + dest] = b_row_id; - s_b_colors[warp_id * WARP_SIZE + dest] = b_row_color; - } - - const int num_rows = __popc( vote ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS ) - { - int local_k = k + lane_id_div_num_threads; - // Is it an active thread. - bool is_active_k = local_k < num_rows; - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - int uniform_b_row_id = -1, uniform_b_color = -1; - - if ( is_active_k ) - { - uniform_b_row_id = s_b_rows [warp_id * WARP_SIZE + local_k]; - uniform_b_color = s_b_colors[warp_id * WARP_SIZE + local_k]; - } - - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( is_active_k && lane_id_mod_num_threads < 2 ) - { - b_col_tmp = utils::Ld::load( &A_rows[uniform_b_row_id + lane_id_mod_num_threads] ); - } - - int b_col_it = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 0 ); - int b_col_end = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 1 ); - - // Iterate over the range of columns of B. 
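/*
 * The NUM_THREADS_PER_ROW kernel variants split the 32-lane warp into
 * 32 / NUM_THREADS_PER_ROW groups so that several rows of B are scanned at
 * once instead of one row per whole warp: lane_id_div_num_threads selects the
 * group and lane_id_mod_num_threads the position inside it. A minimal sketch
 * of that decomposition; the row ids are passed as a plain array here for
 * simplicity, whereas the kernels broadcast begin/end offsets with shuffles
 * from lane (group * NUM_THREADS_PER_ROW + 0/1):
 */
template< int NUM_THREADS_PER_ROW >
__device__ void sketch_row_groups( const int *B_rows, const int *B_cols,
                                   const int *b_row_ids, int num_rows )
{
    const int lane_id = threadIdx.x % 32;
    const int group   = lane_id / NUM_THREADS_PER_ROW;   // which row this lane helps scan
    const int sublane = lane_id % NUM_THREADS_PER_ROW;   // position inside the group
    const int NUM_LOADED_ROWS = 32 / NUM_THREADS_PER_ROW;

    for ( int k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS )
    {
        const int local_k = k + group;
        if ( local_k >= num_rows ) { break; }
        const int row = b_row_ids[local_k];

        // Each group of NUM_THREADS_PER_ROW lanes strides through its own row.
        for ( int it = B_rows[row] + sublane ; it < B_rows[row + 1] ; it += NUM_THREADS_PER_ROW )
        {
            const int col = B_cols[it];
            (void) col;  // a real kernel inserts col into the per-warp hash container
        }
    }
}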
- for ( b_col_it += lane_id_mod_num_threads ; utils::any(b_col_it < b_col_end) ; b_col_it += NUM_THREADS_PER_ROW ) - { - int b_col_id = -1, b_col_color = -1; - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &A_cols[b_col_it] ); - b_col_color = utils::Ld::load( &A_coloring[b_col_id] ); - } - - int item = -1; - - if ( b_col_color >= uniform_b_color && b_col_color != a_row_color ) - { - item = b_col_id; - } - - set.insert( item, COUNT_ONLY ? wk_status : NULL ); - } - } - } - } - - // Store the results. - if ( COUNT_ONLY ) - { - int count = set.compute_size_with_duplicates(); - - if ( lane_id == 0 ) - { - C_rows[a_row_id] = count; - } - } - else - { - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[a_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - set.store( count, &C_cols[c_col_it] ); - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE > -__global__ __launch_bounds__( CTA_SIZE, 6 ) -void -compute_values_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const Value_type *__restrict A_vals, - const int *__restrict B_rows, - const int *__restrict B_cols, - const Value_type *__restrict B_vals, - const int *__restrict C_rows, - int *__restrict C_cols, - Value_type *__restrict C_vals, - int *Aq1, - int *Bq1, - int *Aq2, - int *Bq2, - const int gmem_size, - int *g_keys, - Value_type *g_vals, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / 32; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id(); - const int lane_id = utils::lane_id(); - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_map map( &s_keys[warp_id * SMEM_SIZE], - &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], - &g_vals[a_row_id * gmem_size], - gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - int c_row_id = a_row_id; - - if (Aq1 != NULL) - { - a_row_id = Aq1[a_row_id]; - } - - // Clear the map. - map.clear(); - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. 
- int b_row_id = -1; - Value_type a_value = amgx::types::util::get_zero(); - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - a_value = utils::Ld::load( &A_vals[a_col_it] ); - - //b_row_id is actually column of A - if (Aq2 != NULL) - { - b_row_id = Aq2[b_row_id]; - } - - if (Bq1 != NULL) - { - b_row_id = Bq1[b_row_id]; - } - } - - const int num_rows = __popc( utils::ballot(is_active) ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; ++k ) - { - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = utils::shfl( b_row_id, k ); - // The value of A. - const Value_type uniform_a_value = utils::shfl( a_value, k ); - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( lane_id < 2 ) - { - b_col_tmp = utils::Ld::load( &B_rows[uniform_b_row_id + lane_id] ); - } - - int b_col_it = utils::shfl( b_col_tmp, 0 ); - int b_col_end = utils::shfl( b_col_tmp, 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id ; utils::any(b_col_it < b_col_end) ; b_col_it += WARP_SIZE ) - { - int b_col_id = -1; - Value_type b_value = amgx::types::util::get_zero(); - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &B_cols[b_col_it] ); - b_value = utils::Ld::load( &B_vals[b_col_it] ); - - if (Bq2 != NULL) - { - b_col_id = Bq2[b_col_id]; - } - } - - map.insert( b_col_id, uniform_a_value, b_value, wk_status ); - } - } - } - - // Store the results. - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[c_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. - int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template< int NUM_THREADS_PER_ROW, typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE > -__global__ __launch_bounds__( CTA_SIZE, 6 ) -void -compute_values_kernel( const int A_num_rows, - const int *__restrict A_rows, - const int *__restrict A_cols, - const Value_type *__restrict A_vals, - const int *__restrict B_rows, - const int *__restrict B_cols, - const Value_type *__restrict B_vals, - const int *__restrict C_rows, - int *__restrict C_cols, - Value_type *__restrict C_vals, - int *Aq1, - int *Bq1, - int *Aq2, - int *Bq2, - const int gmem_size, - int *g_keys, - Value_type *g_vals, - int *wk_work_queue, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // Constants. - const int lane_id_div_num_threads = lane_id / NUM_THREADS_PER_ROW; - const int lane_id_mod_num_threads = lane_id % NUM_THREADS_PER_ROW; - // First threads load the row IDs of A needed by the CTA... - int a_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. 
- Hash_map map( &s_keys[warp_id * SMEM_SIZE], - &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], - &g_vals[a_row_id * gmem_size], - gmem_size ); - - // Loop over rows of A. - for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) - { - int c_row_id = a_row_id; - - if (Aq1 != NULL) - { - a_row_id = Aq1[a_row_id]; - } - - // Clear the map. - map.clear_all(); - // Load the range of the row. - int a_col_tmp = -1; - - if ( lane_id < 2 ) - { - a_col_tmp = utils::Ld::load( &A_rows[a_row_id + lane_id] ); - } - - int a_col_it = utils::shfl( a_col_tmp, 0 ); - int a_col_end = utils::shfl( a_col_tmp, 1 ); - - // Iterate over the columns of A. - for ( a_col_it += lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) - { - // Is it an active thread. - const bool is_active = a_col_it < a_col_end; - // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. - int b_row_id(-1); - Value_type a_value = amgx::types::util::get_zero(); - - if ( is_active ) - { - b_row_id = utils::Ld::load( &A_cols[a_col_it] ); - a_value = utils::Ld::load( &A_vals[a_col_it] ); - - //b_row_id is actually column of A - if (Aq2 != NULL) - { - b_row_id = Aq2[b_row_id]; - } - - if (Bq1 != NULL) - { - b_row_id = Bq1[b_row_id]; - } - } - - const int num_rows = __popc( utils::ballot(is_active) ); - - // Uniform loop: threads collaborate to load other elements. - for ( int k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS ) - { - int local_k = k + lane_id_div_num_threads; - // Is it an active thread. - bool is_active_k = local_k < num_rows; - // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). - const int uniform_b_row_id = utils::shfl( b_row_id, k + lane_id_div_num_threads ); - // The value of A. - const Value_type uniform_a_value = utils::shfl( a_value, k + lane_id_div_num_threads ); - // Load the range of the row of B. - int b_col_tmp = -1; - - if ( is_active_k && lane_id_mod_num_threads < 2 ) - { - b_col_tmp = utils::Ld::load( &B_rows[uniform_b_row_id + lane_id_mod_num_threads] ); - } - - int b_col_it = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 0 ); - int b_col_end = utils::shfl( b_col_tmp, lane_id_div_num_threads * NUM_THREADS_PER_ROW + 1 ); - - // Iterate over the range of columns of B. - for ( b_col_it += lane_id_mod_num_threads ; utils::any(b_col_it < b_col_end) ; b_col_it += NUM_THREADS_PER_ROW ) - { - int b_col_id(-1); - Value_type b_value = amgx::types::util::get_zero(); - - if ( b_col_it < b_col_end ) - { - b_col_id = utils::Ld::load( &B_cols[b_col_it] ); - b_value = utils::Ld::load( &B_vals[b_col_it] ); - - //b_col_id is actually column of B - if (Bq2 != NULL) - { - b_col_id = Bq2[b_col_id]; - } - } - - map.insert_with_duplicates( b_col_id, uniform_a_value * b_value, wk_status ); - } - } - } - - // Store the results. - int c_col_tmp = -1; - - if ( lane_id < 2 ) - { - c_col_tmp = utils::Ld::load( &C_rows[c_row_id + lane_id] ); - } - - int c_col_it = utils::shfl( c_col_tmp, 0 ); - int c_col_end = utils::shfl( c_col_tmp, 1 ); - // Store the results. 
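/*
 * The numeric phase mirrors the symbolic one: for each row i of A, the warp
 * accumulates a_ik * b_kj into a per-row hash map keyed by the column j, then
 * flushes the map into the slots of C reserved by the counting pass (the
 * map.store call just below). A serial reference of that accumulation, using
 * std::map so the output columns come out sorted; it is a mental model only,
 * not the Hash_map used by these kernels:
 */
#include <map>
#include <vector>

void sketch_spgemm_numeric_row( int i,
                                const std::vector<int> &A_rows, const std::vector<int> &A_cols, const std::vector<double> &A_vals,
                                const std::vector<int> &B_rows, const std::vector<int> &B_cols, const std::vector<double> &B_vals,
                                const std::vector<int> &C_rows, std::vector<int> &C_cols, std::vector<double> &C_vals )
{
    std::map<int, double> acc;                           // column -> accumulated value

    for ( int k = A_rows[i] ; k < A_rows[i + 1] ; ++k )
    {
        const int    b_row = A_cols[k];
        const double a_ik  = A_vals[k];

        for ( int j = B_rows[b_row] ; j < B_rows[b_row + 1] ; ++j )
        {
            acc[B_cols[j]] += a_ik * B_vals[j];          // the insert_with_duplicates step
        }
    }

    int out = C_rows[i];                                 // slots reserved by the symbolic phase
    for ( std::map<int, double>::const_iterator it = acc.begin() ; it != acc.end() ; ++it )
    {
        C_cols[out] = it->first;
        C_vals[out] = it->second;
        ++out;
    }
}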
- int count = c_col_end - c_col_it; - - if ( count == 0 ) - { - continue; - } - - map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); - } -} - - -template< typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE > -__global__ __launch_bounds__( CTA_SIZE, 6 ) -void -compute_values_RAP_ext_kernel( const int RAP_int_num_rows, - const int *__restrict RAP_int_rows, - const int *__restrict RAP_int_cols, - const Value_type *__restrict RAP_int_vals, - int **RAP_ext_row_ptrs, - int **RAP_ext_col_ptrs, - Value_type **RAP_ext_val_ptrs, - int *__restrict RAP_rows, - int *__restrict RAP_cols, - Value_type *__restrict RAP_vals, - int **flagArray_ptrs, - const int gmem_size, - int *g_keys, - Value_type *g_vals, - int *wk_work_queue, - int num_neighbors, - int *wk_status ) -{ - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // The hash keys stored in shared memory. - __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; - // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; - // The coordinates of the thread inside the CTA/warp. - const int warp_id = utils::warp_id( ); - const int lane_id = utils::lane_id( ); - // First threads load the row IDs of A needed by the CTA... - int rap_int_row_id = blockIdx.x * NUM_WARPS + warp_id; - // Create local storage for the set. - Hash_map map( &s_keys[warp_id * SMEM_SIZE], - &g_keys[rap_int_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], - &g_vals[rap_int_row_id * gmem_size], - gmem_size ); - - // Loop over rows of RAP_ext - for ( ; rap_int_row_id < RAP_int_num_rows ; rap_int_row_id = get_work( wk_work_queue, warp_id ) ) - { - // Clear the map. - map.clear_all(); - // --------------------------------- - // First process RAP_int - // --------------------------------- - sparse_add_process_row_values(rap_int_row_id, RAP_int_rows, RAP_int_cols, RAP_int_vals, lane_id, map, wk_status); - - // --------------------------------- - // Then process RAP_ext - // --------------------------------- - - for (int i = 0; i < num_neighbors; i++) - { - int flag = flagArray_ptrs[i][rap_int_row_id]; - - if (flag != -1) - { - int *RAP_ext_rows = RAP_ext_row_ptrs[i]; - int *RAP_ext_cols = RAP_ext_col_ptrs[i]; - Value_type *RAP_ext_vals = RAP_ext_val_ptrs[i]; - int pos_in_row_ext = flag; - sparse_add_process_row_values(pos_in_row_ext, RAP_ext_rows, RAP_ext_cols, RAP_ext_vals, lane_id, map, wk_status); - } - } - - // Store the results. - int rap_col_tmp = -1; - - if ( lane_id < 2 ) - { - rap_col_tmp = utils::Ld::load( &RAP_rows[rap_int_row_id + lane_id] ); - } - - int rap_col_it = utils::shfl( rap_col_tmp, 0 ); - int rap_col_end = utils::shfl( rap_col_tmp, 1 ); - // Store the results. 
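The RAP_ext kernel above merges, for every owned coarse row, the locally computed RAP_int row with the rows received from each neighbor; a per-neighbor flag array maps the owned row either to its position inside that neighbor's RAP_ext fragment or to -1 when the neighbor sent nothing. The same merge written out serially on the host, with hypothetical types and std::map standing in for the per-warp hash map:

#include <map>
#include <vector>

// RAP(row) = RAP_int(row) + sum over neighbors n of RAP_ext[n](flag[n][row]),
// where flag[n][row] == -1 means neighbor n contributed nothing for this row.
struct CsrFragment
{
    std::vector<int>    row_offsets;
    std::vector<int>    col_indices;
    std::vector<double> values;
};

static void add_row( std::map<int, double> &acc, const CsrFragment &f, int local_row )
{
    for ( int it = f.row_offsets[local_row] ; it < f.row_offsets[local_row + 1] ; ++it )
    {
        acc[f.col_indices[it]] += f.values[it];
    }
}

std::map<int, double> merge_rap_row( int owned_row,
                                     const CsrFragment &rap_int,
                                     const std::vector<CsrFragment> &rap_ext,
                                     const std::vector<std::vector<int>> &flag )
{
    std::map<int, double> acc;            // plays the role of the per-warp hash map
    add_row( acc, rap_int, owned_row );   // local contribution first

    for ( size_t n = 0 ; n < rap_ext.size() ; ++n )
    {
        if ( flag[n][owned_row] != -1 )   // neighbor n sent this row
        {
            add_row( acc, rap_ext[n], flag[n][owned_row] );
        }
    }

    return acc;                           // keys = merged sparsity, values = summed entries
}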
- int count = rap_col_end - rap_col_it; - - if ( count == 0 ) - { - continue; - } - - map.store( count, &RAP_cols[rap_col_it], &RAP_vals[rap_col_it] ); - } -} - - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace csr_multiply_sm70 - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace amgx -{ - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -enum { WARP_SIZE = 32, SMEM_SIZE = 128 }; - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -CSR_Multiply_Sm70 >::CSR_Multiply_Sm70( bool allocate_values, int grid_size, int max_warp_count, int gmem_size ) : - Base(allocate_values, grid_size, max_warp_count, gmem_size) -{} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::count_non_zeroes( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ) -{ - const int GRID_SIZE = 128; - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - - // Compute non-zero elements. - switch ( this->m_num_threads_per_row_count ) - { - case 2: - csr_multiply_sm70::count_non_zeroes_kernel< 2, CTA_SIZE, SMEM_SIZE, WARP_SIZE, true> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - break; - - case 4: - csr_multiply_sm70::count_non_zeroes_kernel< 4, CTA_SIZE, SMEM_SIZE, WARP_SIZE, true> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - break; - - case 8: - csr_multiply_sm70::count_non_zeroes_kernel< 8, CTA_SIZE, SMEM_SIZE, WARP_SIZE, true> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - break; - - case 16: - csr_multiply_sm70::count_non_zeroes_kernel<16, CTA_SIZE, SMEM_SIZE, WARP_SIZE, true> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? 
Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - break; - - default: - csr_multiply_sm70::count_non_zeroes_kernel <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - NULL, - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - } - - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::count_non_zeroes_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids) - -{ - const int GRID_SIZE = 128; - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - // This is num_owned_coarse_rows - int RAP_size = RAP.get_num_rows(); - int RAP_int_size = RAP_int.row_offsets.size() - 1; - - if (RAP_int_size < RAP_size) - { - FatalError("RAP_int has less rows than RAP, need to modify sparse RAP add to handle that case\n", AMGX_ERR_NOT_IMPLEMENTED); - } - - // Create a device vector of the raw pointers to the array - // RAP_ext_row_ids_ptrs - // RAP_ext_row_offsets_ptrs - // RAP_ext_col_indices_ptrs - // RAP_ext_values_ptrs - int num_neighbors = RAP_ext_row_offsets.size(); - std::vector flagArray(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - flagArray[i].resize(RAP_size); - thrust::fill(flagArray[i].begin(), flagArray[i].end(), -1); - } - - cudaCheckError(); - std::vector flagArray_ptrs_h(num_neighbors); - std::vector RAP_ext_row_offsets_ptrs_h(num_neighbors); - std::vector RAP_ext_col_indices_ptrs_h(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - flagArray_ptrs_h[i] = thrust::raw_pointer_cast(&flagArray[i][0]); - RAP_ext_row_offsets_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_row_offsets[i][0]); - RAP_ext_col_indices_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_col_indices[i][0]); - } - - device_vector_alloc flagArray_ptrs = flagArray_ptrs_h; - device_vector_alloc RAP_ext_row_offsets_ptrs = RAP_ext_row_offsets_ptrs_h; - device_vector_alloc RAP_ext_col_indices_ptrs = RAP_ext_col_indices_ptrs_h; - - for (int i = 0; i < num_neighbors; i++) - { - int size = RAP_ext_row_ids[i].size(); - - if (size != 0) - { - int num_blocks = min(4096, (size + 127) / 128); - //write the position in RAP_ext_row_ids - csr_multiply_sm70::flag_halo_rows <<< num_blocks, 128>>>( - RAP_ext_row_ids[i].raw(), - size, - flagArray[i].raw(), - i, - RAP.manager->global_id()); - } - } - - csr_multiply_sm70::count_non_zeroes_RAP_ext_kernel <<< GRID_SIZE, CTA_SIZE>>>( - RAP_size, - RAP_int.row_offsets.raw(), - RAP_int.col_indices.raw(), - thrust::raw_pointer_cast(&RAP_ext_row_offsets_ptrs[0]), - thrust::raw_pointer_cast(&RAP_ext_col_indices_ptrs[0]), - RAP.row_offsets.raw(), - (int *) NULL, - thrust::raw_pointer_cast(&flagArray_ptrs[0]), - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status, - num_neighbors, - 
RAP.manager->global_id() ); - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - -// ==================================================================================================================== - -template< int CTA_SIZE, bool COUNT_ONLY, typename Diag_traits, typename Matrix > -static void -count_non_zeroes_ilu1_dispatch( const Matrix &A, Matrix &B, int num_threads_per_row_count, int gmem_size, int *keys, int *work_queue, int *status ) -{ - const int GRID_SIZE = 128; - - switch ( num_threads_per_row_count ) - { - case 2: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel< 2, CTA_SIZE, SMEM_SIZE, WARP_SIZE, COUNT_ONLY, Diag_traits> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - break; - - case 4: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel< 4, CTA_SIZE, SMEM_SIZE, WARP_SIZE, COUNT_ONLY, Diag_traits> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - break; - - case 8: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel< 8, CTA_SIZE, SMEM_SIZE, WARP_SIZE, COUNT_ONLY, Diag_traits> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - break; - - case 16: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel<16, CTA_SIZE, SMEM_SIZE, WARP_SIZE, COUNT_ONLY, Diag_traits> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - break; - - default: - csr_multiply_sm70::count_non_zeroes_ilu1_kernel <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - thrust::raw_pointer_cast( &A.getMatrixColoring().getRowColors()[0] ), - B.row_offsets.raw(), - B.col_indices.raw(), - gmem_size, - keys, - work_queue, - status ); - } - - cudaCheckError(); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::count_non_zeroes_ilu1( const Matrix_d &A, Matrix_d &B ) -{ - const int GRID_SIZE = 128; - - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - - // Count the number of non zeroes. - if ( A.hasProps(DIAG) ) - count_non_zeroes_ilu1_dispatch( - A, - B, - this->m_num_threads_per_row_count, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - else - count_non_zeroes_ilu1_dispatch( - A, - B, - this->m_num_threads_per_row_count, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - this->m_status ); - - // Compute non-zero elements. 
- CUDA_SAFE_CALL( cudaGetLastError() ); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_offsets( Matrix_d &C ) -{ - thrust::device_ptr offsets_begin(C.row_offsets.raw()); - thrust::device_ptr offsets_end (C.row_offsets.raw() + C.get_num_rows() + 1); - thrust::exclusive_scan( offsets_begin, offsets_end, offsets_begin ); - cudaCheckError(); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_sparsity( const Matrix_d &A, const Matrix_d &B, Matrix_d &C ) -{ - const int GRID_SIZE = 128; - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // std::cerr << "CSR_Multiply_Sm70 >::compute_sparsity" << std::endl; - // Reset the work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - - // Compute the values. - switch ( this->m_num_threads_per_row_count ) - { - case 2: - csr_multiply_sm70::count_non_zeroes_kernel< 2, CTA_SIZE, SMEM_SIZE, WARP_SIZE, false> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - break; - - case 4: - csr_multiply_sm70::count_non_zeroes_kernel< 4, CTA_SIZE, SMEM_SIZE, WARP_SIZE, false> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - break; - - case 8: - csr_multiply_sm70::count_non_zeroes_kernel< 8, CTA_SIZE, SMEM_SIZE, WARP_SIZE, false> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - break; - - case 16: - csr_multiply_sm70::count_non_zeroes_kernel<16, CTA_SIZE, SMEM_SIZE, WARP_SIZE, false> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - break; - - default: - csr_multiply_sm70::count_non_zeroes_kernel <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - NULL, - NULL, - NULL, - NULL, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - } - - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_sparsity_ilu1( const Matrix_d &A, Matrix_d &B ) -{ - const 
int GRID_SIZE = 128; - - const int CTA_SIZE = 256; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - - // Count the number of non zeroes. - if ( A.hasProps(DIAG) ) - count_non_zeroes_ilu1_dispatch( - A, - B, - this->m_num_threads_per_row_count, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - else - count_non_zeroes_ilu1_dispatch( - A, - B, - this->m_num_threads_per_row_count, - this->m_gmem_size, - this->m_keys, - this->m_work_queue, - NULL ); - - // Make sure it worked properly. - CUDA_SAFE_CALL( cudaGetLastError() ); -} - -// ==================================================================================================================== - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_values( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, int num_threads, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ) -{ - const int GRID_SIZE = 256; - const int CTA_SIZE = 128; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset the work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - // Compute the values. - int *status = NULL; - - if ( num_threads != this->m_num_threads_per_row_compute ) - { - status = this->m_status; - } - - switch ( num_threads ) - { - case 2: - csr_multiply_sm70::compute_values_kernel< 2, Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - break; - - case 4: - csr_multiply_sm70::compute_values_kernel< 4, Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - break; - - case 8: - csr_multiply_sm70::compute_values_kernel< 8, Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? 
Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - break; - - case 16: - csr_multiply_sm70::compute_values_kernel<16, Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - break; - - default: - csr_multiply_sm70::compute_values_kernel <<< GRID_SIZE, CTA_SIZE>>>( - A.get_num_rows(), - A.row_offsets.raw(), - A.col_indices.raw(), - A.values.raw(), - B.row_offsets.raw(), - B.col_indices.raw(), - B.values.raw(), - C.row_offsets.raw(), - C.col_indices.raw(), - C.values.raw(), - (Aq1 != NULL) ? Aq1->raw() : NULL, - (Bq1 != NULL) ? Bq1->raw() : NULL, - (Aq2 != NULL) ? Aq2->raw() : NULL, - (Bq2 != NULL) ? Bq2->raw() : NULL, - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - status ); - } - - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - - -template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > -void CSR_Multiply_Sm70 >::compute_values_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids, int num_threads) -{ - const int GRID_SIZE = 128; - const int CTA_SIZE = 128; - const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // Reset the work queue. - int work_offset = GRID_SIZE * NUM_WARPS; - CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); - // Compute the values. 
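All of these kernels are launched with a fixed grid and pull extra rows from a single global counter: the host primes it with GRID_SIZE * NUM_WARPS (each resident warp takes its first row from blockIdx and warp_id), and a warp that finishes its row calls get_work, where lane 0 performs an atomicAdd and a shuffle broadcasts the result to the other lanes. A minimal sketch of that persistent-warp work queue, with hypothetical names and CUDA 9+ shuffles:

#include <cstdio>

#define WARP_SIZE 32
#define FULL_MASK 0xffffffffu

// Lane 0 grabs the next row index from the global counter; the whole warp
// receives it through a shuffle (sketch of the get_work idiom).
__device__ __forceinline__ int get_next_row( int *queue )
{
    int next = -1;

    if ( threadIdx.x % WARP_SIZE == 0 )
    {
        next = atomicAdd( queue, 1 );
    }

    return __shfl_sync( FULL_MASK, next, 0 );
}

__global__ void persistent_warp_loop( int num_rows, int *queue, int *touched )
{
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int num_warps = blockDim.x / WARP_SIZE;

    // Each warp starts on its "own" row, then keeps pulling work from the queue.
    for ( int row = blockIdx.x * num_warps + warp_id ; row < num_rows ; row = get_next_row( queue ) )
    {
        if ( threadIdx.x % WARP_SIZE == 0 )
        {
            atomicAdd( &touched[row], 1 );   // stand-in for real per-row work
        }
    }
}

int main()
{
    const int grid = 4, cta = 128, num_warps_total = grid * ( cta / WARP_SIZE );
    const int num_rows = 1000;
    int *queue, *touched;
    cudaMalloc( &queue, sizeof(int) );
    cudaMalloc( &touched, num_rows * sizeof(int) );
    cudaMemset( touched, 0, num_rows * sizeof(int) );
    // The queue starts just past the rows the warps pick up implicitly.
    cudaMemcpy( queue, &num_warps_total, sizeof(int), cudaMemcpyHostToDevice );
    persistent_warp_loop<<<grid, cta>>>( num_rows, queue, touched );
    cudaDeviceSynchronize();
    printf( "done\n" );
    cudaFree( queue );
    cudaFree( touched );
    return 0;
}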
- int *status = NULL; - - if ( num_threads != this->m_num_threads_per_row_compute ) - { - status = this->m_status; - } - - // This is num_owned_coarse_rows - int RAP_size = RAP.get_num_rows(); - int RAP_int_size = RAP_int.row_offsets.size() - 1; - - if (RAP_int_size < RAP_size) - { - FatalError("RAP_int has less rows than RAP, need to modify sparse RAP add to handle that case\n", AMGX_ERR_NOT_IMPLEMENTED); - } - - //TODO: Optimize: reuse arrays from count nonzeros - int num_neighbors = RAP_ext_row_offsets.size(); - std::vector flagArray(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - flagArray[i].resize(RAP_size); - thrust::fill(flagArray[i].begin(), flagArray[i].end(), -1); - } - - cudaCheckError(); - std::vector flagArray_ptrs_h(num_neighbors); - std::vector RAP_ext_row_offsets_ptrs_h(num_neighbors); - std::vector RAP_ext_col_indices_ptrs_h(num_neighbors); - std::vector RAP_ext_values_ptrs_h(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - flagArray_ptrs_h[i] = thrust::raw_pointer_cast(&flagArray[i][0]); - RAP_ext_row_offsets_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_row_offsets[i][0]); - RAP_ext_col_indices_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_col_indices[i][0]); - RAP_ext_values_ptrs_h[i] = thrust::raw_pointer_cast(&RAP_ext_values[i][0]); - } - - device_vector_alloc flagArray_ptrs = flagArray_ptrs_h; - device_vector_alloc RAP_ext_row_offsets_ptrs = RAP_ext_row_offsets_ptrs_h; - device_vector_alloc RAP_ext_col_indices_ptrs = RAP_ext_col_indices_ptrs_h; - device_vector_alloc RAP_ext_values_ptrs = RAP_ext_values_ptrs_h; - - for (int i = 0; i < num_neighbors; i++) - { - int size = RAP_ext_row_ids[i].size(); - - if (size != 0) - { - int num_blocks = min(4096, (size + 127) / 128); - //write the position in RAP_ext_row_ids - csr_multiply_sm70::flag_halo_rows <<< num_blocks, 128>>>( - RAP_ext_row_ids[i].raw(), - size, - flagArray[i].raw(), - i, - RAP.manager->global_id()); - } - } - - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); - csr_multiply_sm70::compute_values_RAP_ext_kernel< Value_type, CTA_SIZE, SMEM_SIZE, WARP_SIZE> <<< GRID_SIZE, CTA_SIZE>>>( - RAP_size, - RAP_int.row_offsets.raw(), - RAP_int.col_indices.raw(), - RAP_int.values.raw(), - thrust::raw_pointer_cast(&RAP_ext_row_offsets_ptrs[0]), - thrust::raw_pointer_cast(&RAP_ext_col_indices_ptrs[0]), - thrust::raw_pointer_cast(&RAP_ext_values_ptrs[0]), - RAP.row_offsets.raw(), - RAP.col_indices.raw(), - RAP.values.raw(), - thrust::raw_pointer_cast(&flagArray_ptrs[0]), - this->m_gmem_size, - this->m_keys, - this->m_vals, - this->m_work_queue, - num_neighbors, - status ); - cudaCheckError(); - //CUDA_SAFE_CALL( cudaGetLastError() ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#define AMGX_CASE_LINE(CASE) template class CSR_Multiply_Sm70::Type>; -AMGX_FORALL_BUILDS(AMGX_CASE_LINE) -AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) -#undef AMGX_CASE_LINE - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace amgx - diff --git a/base/src/multiply.cu b/base/src/multiply.cu index 4de2781b..333d756d 100644 --- a/base/src/multiply.cu +++ b/base/src/multiply.cu @@ -395,10 +395,10 @@ void blockDiaCsrMultiplyKernel(const IndexType *row_offsets, template< typename IndexType, typename ValueTypeA, typename ValueTypeB, int CTA_SIZE, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if 
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, const IndexType *column_indices, @@ -418,6 +418,11 @@ void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, const int laneId_div_16 = laneId / 16; const int upperHalf = 16 * laneId_div_16; const int upperMask = 0xffff << upperHalf; +#if __CUDA_ARCH__ < 300 + const int nWarps = CTA_SIZE / 32; // Number of half warps per CTA. + const int warpId = threadIdx.x / 32; + volatile __shared__ IndexType s_aColIds[nWarps][32]; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile ValueTypeB s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -465,13 +470,20 @@ void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, aColId = column_indices[aColIt]; } +#if __CUDA_ARCH__ < 300 + s_aColIds[warpId][laneId] = aColId; +#endif // Loop over columns. We compute 8 columns per iteration. for ( int k = 0, nCols = __popc( utils::ballot(aColId != -1) & upperMask ) ; k < nCols ; k += 4 ) { int my_k = k + halfLaneId_div_4; // Exchange column indices. +#if __CUDA_ARCH__ >= 300 int waColId = utils::shfl( aColId, upperHalf + my_k ); +#else + int waColId = s_aColIds[warpId][upperHalf + my_k]; +#endif // Load 8 blocks of X if needed. ValueTypeB my_x = types::util::get_zero(); @@ -513,6 +525,8 @@ void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, } // Loop over aColIt // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_Ax = my_Ax + utils::shfl_xor( my_Ax, 1 ); @@ -524,6 +538,40 @@ void blockDiaCsrMultiplyKernelDiaProps_4x4( const IndexType *row_offsets, my_Ax = my_Ax + utils::shfl_xor( my_Ax, 8 ); } +#else + types::util::volcast(my_Ax, s_mem + threadIdx.x); + { + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) + { + my_Ax = my_Ax + types::util::volcast(s_mem[threadIdx.x + 1]); + types::util::volcast(my_Ax, s_mem + threadIdx.x); + } + + if ( laneId < 30 ) + { + my_Ax = my_Ax + types::util::volcast(s_mem[threadIdx.x + 2]); + types::util::volcast(my_Ax, s_mem + threadIdx.x); + } + } + else + { + if ( laneId < 31 ) + { + my_Ax = my_Ax + types::util::volcast(s_mem[threadIdx.x + 4]); + types::util::volcast(my_Ax, s_mem + threadIdx.x); + } + + if ( laneId < 30 ) + { + my_Ax = my_Ax + types::util::volcast(s_mem[threadIdx.x + 8]); + types::util::volcast(my_Ax, s_mem + threadIdx.x); + } + } + } +#endif + // Store the results. 
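The hunk above moves the launch-bounds guard back to __CUDA_ARCH__ >= 350 and restores the pre-Kepler reduction path: newer devices combine my_Ax across neighbouring lanes with shfl_xor, while the fallback stages partial sums in volatile shared memory and relies on legacy warp-synchronous execution. A minimal sketch of the two styles, with hypothetical names; the shared-memory variant would need __syncwarp between steps on Volta and later:

#define WARP_SIZE 32
#define FULL_MASK 0xffffffffu

// Butterfly reduction with shuffles: after log2(32) steps every lane holds
// the warp-wide sum (Kepler and newer).
__device__ float warp_sum_shfl( float v )
{
    for ( int mask = 1 ; mask < WARP_SIZE ; mask <<= 1 )
    {
        v += __shfl_xor_sync( FULL_MASK, v, mask );
    }

    return v;
}

// Shared-memory fallback: each lane publishes its value, then folds in its
// neighbour's slot, the way the restored pre-SM30 path does. The volatile
// qualifier keeps the compiler from caching the staging buffer in registers.
__device__ float warp_sum_smem( float v, volatile float *s_buf, int lane_id )
{
    s_buf[threadIdx.x] = v;

    for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 )
    {
        if ( lane_id < offset )
        {
            v += s_buf[threadIdx.x + offset];
            s_buf[threadIdx.x] = v;
        }
    }

    return s_buf[threadIdx.x - lane_id];   // lane 0's slot holds the total
}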
if ( ROW_MAJOR ) { diff --git a/core/include/matrix_coloring/bfs.h b/core/include/matrix_coloring/bfs.h index 7fa21715..82b88dd0 100644 --- a/core/include/matrix_coloring/bfs.h +++ b/core/include/matrix_coloring/bfs.h @@ -42,6 +42,7 @@ template __device__ __forceinline__ T warp const int warpId = utils::warp_id(); const int laneId = utils::lane_id(); T value = input; +#if __CUDA_ARCH__ >= 300 #pragma unroll for (int i = 1; i < WARP_SIZE; i *= 2) @@ -54,6 +55,22 @@ template __device__ __forceinline__ T warp } } +#else + volatile __shared__ T s_shfl[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i < WARP_SIZE; i *= 2) + { + s_shfl[threadIdx.x] = value; + T n = s_shfl[threadIdx.x - i]; + + if (laneId >= i) + { + value += n; + } + } + +#endif return value; } @@ -147,7 +164,11 @@ __global__ void bfs_expand( for (int i = 0; i < n_neighbors; ++i) { +#if __CUDA_ARCH__ >= 350 int col = __ldg(col_indices + row_begin + i); +#else + int col = col_indices[row_begin + i]; +#endif task_queue_out[block_tail_ + n_neighbors_exclusive_scan + i] = col; } @@ -165,7 +186,15 @@ struct filter_visited_closure __device__ inline int operator()(const int &index) { +#if __CUDA_ARCH__ >= 350 + if (__ldg(distances_ptr + index) >= 0) { return 1; } + +#else + + if (distances_ptr[index] >= 0) { return 1; } + +#endif return 0; } }; diff --git a/core/include/matrix_coloring/coloring_utils.h b/core/include/matrix_coloring/coloring_utils.h index 5a1997c8..99dfb278 100644 --- a/core/include/matrix_coloring/coloring_utils.h +++ b/core/include/matrix_coloring/coloring_utils.h @@ -232,7 +232,9 @@ struct used_color_structure_64_bit template __device__ __forceinline__ void sync_subwarp(const int sublane_id) { +#if __CUDA_ARCH__ >= 300 #pragma unroll + for (int i = WARP_SIZE / 2; i >= 1; i /= 2) { int tmp_hi = __double2hiint( __longlong_as_double( bitfield ) ); @@ -242,6 +244,20 @@ struct used_color_structure_64_bit long long tmp = __double_as_longlong(__hiloint2double(tmp_hi, tmp_lo)); bitfield |= tmp; } + +#else + __shared__ volatile long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_used_colors[threadIdx.x] = bitfield; + long long tmp = s_used_colors[threadIdx.x + i]; + + if (sublane_id + i < WARP_SIZE) { bitfield |= tmp; } + } + +#endif } __device__ __forceinline__ void aggregate(const used_color_structure_64_bit &b) { @@ -364,7 +380,9 @@ unsigned long long int box_id : __device__ __forceinline__ void sync_subwarp(const int sublane_id) { used_color_structure_64_bit_colorbox tmps; +#if __CUDA_ARCH__ >= 300 #pragma unroll + for (int i = WARP_SIZE / 2; i >= 1; i /= 2) { int tmp_hi = __double2hiint( __longlong_as_double( data.bitfield ) ); @@ -375,6 +393,21 @@ unsigned long long int box_id : tmps.data.bitfield = tmp; //get neighbors bitfield aggregate(tmps); } + +#else + __shared__ volatile unsigned long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_used_colors[threadIdx.x] = data.bitfield; + unsigned long long tmp = s_used_colors[threadIdx.x + i]; + tmps.data.bitfield = tmp; //get neighbors bitfield + + if (sublane_id + i < WARP_SIZE) { aggregate(tmps); } + } + +#endif } }; diff --git a/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu b/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu index e0c6c52a..70787114 100644 --- a/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu +++ 
b/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu @@ -38,7 +38,6 @@ #include #include #include -#include #include @@ -51,12 +50,13 @@ namespace aggregation #include -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 -#include // Included inside the namespace to solve name collisions. +#include // Included inside the namespace to solve name collisions. static __device__ __forceinline__ int get_work( int *queue, int warp_id, int count = 1 ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -65,22 +65,23 @@ static __device__ __forceinline__ int get_work( int *queue, int warp_id, int cou } return utils::shfl( offset, 0 ); +#else + return 0; +#endif } #else -#include // Included inside the namespace to solve name collisions. +#include // Included inside the namespace to solve name collisions. -static __device__ __forceinline__ int get_work( int *queue, int warp_id, int count = 1 ) +static __device__ __forceinline__ int get_work( volatile int *offsets, int *queue, int warp_id, int count = 1 ) { - int offset = -1; - if ( utils::lane_id() == 0 ) { - offset = atomicAdd( queue, count ); + offsets[warp_id] = atomicAdd( queue, count ); } - return utils::shfl( offset, 0 ); + return offsets[warp_id]; } #endif @@ -108,6 +109,12 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ < 300 + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile int s_bcast_cols[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -117,9 +124,18 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. // First threads load the row IDs of A needed by the CTA... int r_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 + Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#else Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of R. +#if __CUDA_ARCH__ >= 300 + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // Make sure we have to proceed. if ( COUNT_ONLY ) @@ -151,6 +167,9 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. a_row_id = R_cols[r_col_it]; } +#if __CUDA_ARCH__ < 300 + s_bcast_cols[threadIdx.x] = a_row_id; +#endif const int num_rows = __popc( utils::ballot(is_active) ); // Uniform loop: threads collaborate to load other elements. @@ -160,7 +179,17 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. // Is it an active thread. bool is_active_k = local_k < num_rows; // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). +#if __CUDA_ARCH__ >= 300 const int uniform_a_row_id = utils::shfl( a_row_id, local_k ); +#else + int uniform_a_row_id = -1; + + if ( is_active_k ) + { + uniform_a_row_id = s_bcast_cols[warp_id * WARP_SIZE + local_k]; + } + +#endif // Load the range of the row of B. 
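The bfs.h hunk above restores a shared-memory fallback for warp_scan; both variants are the same Hillis-Steele inclusive prefix sum over one warp, and an exclusive offset can then be obtained by subtracting the lane's own input. A minimal sketch of the shuffle-based variant with current sync intrinsics, hypothetical name, full 32-lane warp assumed:

#define WARP_SIZE 32
#define FULL_MASK 0xffffffffu

// Hillis-Steele inclusive scan across one warp: after step i, each lane
// holds the sum of its own value and the i lanes below it.
__device__ int warp_inclusive_scan( int value )
{
    const int lane_id = threadIdx.x % WARP_SIZE;

#pragma unroll
    for ( int i = 1 ; i < WARP_SIZE ; i *= 2 )
    {
        int n = __shfl_up_sync( FULL_MASK, value, i );

        if ( lane_id >= i )
        {
            value += n;
        }
    }

    return value;   // lane 31 ends up with the warp-wide total
}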
int a_col_it = 0, a_col_end = 0; @@ -213,10 +242,10 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. template< typename Value_type, int NUM_THREADS_PER_ROW, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool HAS_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 8 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 8 ) +__launch_bounds__( CTA_SIZE, 6 ) #endif void fill_A_kernel_1x1( const int R_num_rows, const int *R_rows, @@ -240,8 +269,19 @@ void fill_A_kernel_1x1( const int R_num_rows, const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 // The hash values stored in shared memory. __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; +#else + // Shared memory to vote. + __shared__ volatile int s_bcast_row[CTA_SIZE]; + // The hash keys stored in shared memory. + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to reduce the diagonal. + __shared__ volatile Value_type s_diag[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -251,12 +291,24 @@ void fill_A_kernel_1x1( const int R_num_rows, // First threads load the row IDs of A needed by the CTA... int r_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 Hash_map map( &s_keys[warp_id * SMEM_SIZE ], &g_keys[r_row_id * gmem_size ], &s_vote[warp_id * SMEM_SIZE / 4], &g_vals[r_row_id * gmem_size ], gmem_size ); +#else + Hash_map map( &s_keys[warp_id * SMEM_SIZE ], + &g_keys[r_row_id * gmem_size], + &s_vals[warp_id * SMEM_SIZE ], + &g_vals[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of A. +#if __CUDA_ARCH__ >= 300 + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // The indices of the output row. int ac_col_it = Ac_rows[r_row_id + 0]; @@ -283,6 +335,10 @@ void fill_A_kernel_1x1( const int R_num_rows, a_row_id = R_cols[r_col_it]; } +#if __CUDA_ARCH__ < 300 + s_bcast_row[threadIdx.x] = a_row_id; +#endif + // Update the diagonal (if needed). if ( HAS_DIAG && is_active ) { @@ -296,7 +352,17 @@ void fill_A_kernel_1x1( const int R_num_rows, { int local_k = k + lane_id_div_num_threads; // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). +#if __CUDA_ARCH__ >= 300 const int uniform_a_row_id = utils::shfl( a_row_id, local_k ); +#else + int uniform_a_row_id = -1; + + if ( local_k < num_rows ) + { + uniform_a_row_id = s_bcast_row[warp_id * WARP_SIZE + local_k]; + } + +#endif // The range of the row of B. int a_col_it = 0, a_col_end = 0; @@ -342,7 +408,16 @@ void fill_A_kernel_1x1( const int R_num_rows, // Update the diagonal. 
if ( HAS_DIAG ) { +#if __CUDA_ARCH__ >= 300 r_diag = utils::warp_reduce<1, utils::Add>( r_diag ); +#else + types::util::volcast(r_diag, s_diag + threadIdx.x); +#ifdef _MSC_VER + r_diag = utils::warp_reduce_sum<1, Value_type>(s_diag, r_diag); +#else + r_diag = utils::warp_reduce<1, utils::Add>(s_diag, r_diag); +#endif +#endif if ( lane_id == 0 ) { @@ -365,10 +440,10 @@ void fill_A_kernel_1x1( const int R_num_rows, /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template< typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool HAS_DIAG > -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -__global__ __launch_bounds__( CTA_SIZE, 8 ) -#else +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __global__ __launch_bounds__( CTA_SIZE, 8 ) +#else +__global__ __launch_bounds__( CTA_SIZE ) #endif void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. const int *R_rows, @@ -391,21 +466,46 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ < 300 + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile int s_bcast_cols[CTA_SIZE]; + // The table to share aggregates. + __shared__ volatile int s_bcast_ac_col[CTA_SIZE]; + // Shared memory to broadcast indices. + __shared__ volatile int s_bcast_ac_idx[CTA_SIZE]; + // Shared memory to broadcast values. + __shared__ volatile Value_type s_bcast_ac_val[CTA_SIZE / 2]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); // Constants. const int lane_id_div_16 = lane_id / 16; const int lane_id_mod_16 = lane_id % 16; +#if __CUDA_ARCH__ >= 300 const int warp_offset = 16 * lane_id_div_16; +#else + const int cta_offset = 16 * (threadIdx.x / 16); +#endif // First threads load the row IDs of A needed by the CTA... int r_row_id = blockIdx.x * NUM_WARPS + warp_id; // My index. Hash_index index( &g_idx[r_row_id * gmem_size] ); // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#else + Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of R. +#if __CUDA_ARCH__ >= 300 + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // The indices of the row. int ac_col_it = Ac_rows[r_row_id + 0]; @@ -470,12 +570,21 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. } int ac_idx = ac_col_it + set.find_index( key, index, false ); +#if __CUDA_ARCH__ < 300 + s_bcast_ac_col[threadIdx.x] = ac_col_id; + s_bcast_ac_idx[threadIdx.x] = ac_idx; +#endif // Iterate over the 16 items. for ( int k = 0 ; k < 16 ; ++k ) { +#if __CUDA_ARCH__ >= 300 int uniform_ac_col = utils::shfl( ac_col_id, warp_offset + k ); int uniform_ac_idx = utils::shfl( ac_idx, warp_offset + k ); +#else + int uniform_ac_col = s_bcast_ac_col[cta_offset + k]; + int uniform_ac_idx = s_bcast_ac_idx[cta_offset + k]; +#endif // Early loop exit. 
if ( utils::all( uniform_ac_col == -1 ) ) @@ -493,6 +602,8 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. a_value = A_vals[16 * uniform_a_col_it + lane_id_mod_16]; } +#if __CUDA_ARCH__ >= 300 + // Proceed diagonal if needed. if ( HAS_DIAG && uniform_ac_col == r_row_id ) { @@ -519,13 +630,45 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. { Ac_vals[16 * uniform_ac_idx + lane_id_mod_16] = Ac_vals[16 * uniform_ac_idx + lane_id_mod_16] + a_value; } + +#else + + // Update the diagonal if it is a diagonal term. + if ( HAS_DIAG && uniform_ac_col == r_row_id ) + { + ac_diag = ac_diag + a_value; + uniform_ac_col = -1; + } + + // Update the value. + if ( uniform_ac_col != -1 ) + { + utils::atomic_add( &Ac_vals[16 * uniform_ac_idx + lane_id_mod_16], a_value ); + } + +#endif } } } if ( HAS_DIAG ) { +#if __CUDA_ARCH__ >= 300 ac_diag = ac_diag + utils::shfl_xor( ac_diag, 16 ); +#else + + if ( lane_id_div_16 == 1 ) + { + types::util::volcast(ac_diag, s_bcast_ac_val + 16 * warp_id + lane_id_mod_16); + } + +// + s_bcast_ac_val[16*warp_id + lane_id_mod_16] = ac_diag) + if ( lane_id_div_16 == 0 ) + { + ac_diag = ac_diag + types::util::volcast(s_bcast_ac_val[16 * warp_id + lane_id_mod_16]); + } + +#endif if ( lane_id_div_16 == 0 ) { @@ -538,10 +681,10 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template< typename Value_type, int N, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool HAS_DIAG, bool FORCE_DETERMINISM > -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __global__ __launch_bounds__( CTA_SIZE, 8 ) #else -__global__ __launch_bounds__( CTA_SIZE, 8 ) +__global__ __launch_bounds__( CTA_SIZE ) #endif void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. const int *R_rows, @@ -569,22 +712,46 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. const int NUM_ITEMS_PER_WARP = T_WARP == 0 ? 1 : T_WARP; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ < 300 + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile int s_bcast_cols[CTA_SIZE]; + // The table to share aggregates. + __shared__ volatile int s_bcast_ac_col[CTA_SIZE]; + // Shared memory to broadcast indices. + __shared__ volatile int s_bcast_ac_idx[CTA_SIZE]; + // Shared memory to broadcast values. + __shared__ volatile Value_type s_bcast_ac_val[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); // Constants. const int lane_id_div_NxN = lane_id / NxN; const int lane_id_mod_NxN = lane_id % NxN; +#if __CUDA_ARCH__ >= 300 const int warp_offset = NxN * lane_id_div_NxN; +#else + const int cta_offset = warp_id * WARP_SIZE + NxN * lane_id_div_NxN; +#endif // First threads load the row IDs of A needed by the CTA... int r_row_id = blockIdx.x * NUM_WARPS + warp_id; // My index. Hash_index index( &g_idx[r_row_id * gmem_size] ); // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 + Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#else Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of R. 
+#if __CUDA_ARCH__ >= 300 for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // The indices of the row. int ac_col_it = Ac_rows[r_row_id + 0]; @@ -651,10 +818,15 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. } int ac_idx = ac_col_it + set.find_index( key, index, false ); +#if __CUDA_ARCH__ < 300 + s_bcast_ac_col[threadIdx.x] = ac_col_id; + s_bcast_ac_idx[threadIdx.x] = ac_idx; +#endif // Iterate over the NxN items. for ( int k = 0 ; k < NxN ; ++k ) { +#if __CUDA_ARCH__ >= 300 int uniform_ac_col = utils::shfl( ac_col_id, warp_offset + k ); int uniform_ac_idx = utils::shfl( ac_idx, warp_offset + k ); @@ -664,6 +836,17 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. uniform_ac_idx = -1; } +#else + int uniform_ac_col = -1, uniform_ac_idx = -1; + + if ( lane_id_div_NxN < NUM_ITEMS_PER_WARP ) + { + uniform_ac_col = s_bcast_ac_col[cta_offset + k]; + uniform_ac_idx = s_bcast_ac_idx[cta_offset + k]; + } + +#endif + // Early loop exit. if ( utils::all( uniform_ac_col == -1 ) ) { @@ -700,7 +883,16 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. { if ( !FORCE_DETERMINISM ) { +#if __CUDA_ARCH__ >= 300 ac_diag = utils::warp_reduce( ac_diag ); +#else + types::util::volcast(ac_diag, s_bcast_ac_val + threadIdx.x); +#ifdef _MSC_VER + ac_diag = utils::warp_reduce_sum(s_bcast_ac_val, ac_diag); +#else + ac_diag = utils::warp_reduce( s_bcast_ac_val, ac_diag ); +#endif +#endif } if ( lane_id_div_NxN == 0 ) @@ -713,10 +905,10 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. // when blocksize is larger than warp size template< typename Value_type, int N, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool HAS_DIAG, bool FORCE_DETERMINISM, int NUM_BLOCK_ITERS_PER_WARP> -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __global__ __launch_bounds__( CTA_SIZE, 8 ) #else -__global__ __launch_bounds__( CTA_SIZE, 8 ) +__global__ __launch_bounds__( CTA_SIZE ) #endif void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. const int *R_rows, @@ -743,6 +935,18 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. const int NUM_ITEMS_PER_WARP = 1; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ < 300 + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile int s_bcast_cols[CTA_SIZE]; + // The table to share aggregates. + __shared__ volatile int s_bcast_ac_col[CTA_SIZE]; + // Shared memory to broadcast indices. + __shared__ volatile int s_bcast_ac_idx[CTA_SIZE]; + // Shared memory to broadcast values. + __shared__ volatile Value_type s_bcast_ac_val[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -751,9 +955,18 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. // My index. Hash_index index( &g_idx[r_row_id * gmem_size] ); // Create local storage for the set. +#if __CUDA_ARCH__ >= 300 + Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#else Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[r_row_id * gmem_size], gmem_size ); +#endif // Loop over rows of R. 
+#if __CUDA_ARCH__ >= 300 + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for ( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif { // The indices of the row. int ac_col_it = Ac_rows[r_row_id + 0]; @@ -808,12 +1021,21 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. } int ac_idx = ac_col_it + set.find_index( key, index, false ); +#if __CUDA_ARCH__ < 300 + s_bcast_ac_col[threadIdx.x] = ac_col_id; + s_bcast_ac_idx[threadIdx.x] = ac_idx; +#endif // Iterate over the NxN items. for ( int k = 0 ; k < NxN ; ++k ) { +#if __CUDA_ARCH__ >= 300 int uniform_ac_col = utils::shfl( ac_col_id, k ); int uniform_ac_idx = utils::shfl( ac_idx, k ); +#else + int uniform_ac_col = s_bcast_ac_col[k]; + int uniform_ac_idx = s_bcast_ac_idx[k]; +#endif // Early loop exit. if ( utils::all( uniform_ac_col == -1 ) ) @@ -857,7 +1079,16 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. { if ( !FORCE_DETERMINISM ) { +#if __CUDA_ARCH__ >= 300 ac_diag = utils::warp_reduce( ac_diag ); +#else + types::util::volcast(ac_diag, s_bcast_ac_val + threadIdx.x); +#ifdef _MSC_VER + ac_diag = utils::warp_reduce_sum(s_bcast_ac_val, ac_diag); +#else + ac_diag = utils::warp_reduce( s_bcast_ac_val, ac_diag ); +#endif +#endif } Ac_vals[NxN * Ac_diag[r_row_id] + lane_id] = ac_diag; diff --git a/core/src/aggregation/selectors/parallel_greedy_selector.cu b/core/src/aggregation/selectors/parallel_greedy_selector.cu index d54e5dac..4a52cca3 100755 --- a/core/src/aggregation/selectors/parallel_greedy_selector.cu +++ b/core/src/aggregation/selectors/parallel_greedy_selector.cu @@ -189,6 +189,7 @@ void compute_ring_leader( const int A_num_rows, int *__restrict out_leader_id, int *__restrict out_leader_hash ) { +#if __CUDA_ARCH__ >= 300 const int NUM_WARPS_PER_CTA = CTA_SIZE / WARP_SIZE; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); @@ -289,6 +290,8 @@ void compute_ring_leader( const int A_num_rows, out_leader_hash[a_row_id] = my_max_hash; } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -308,6 +311,7 @@ void build_aggregates( const int num_rings, int *__restrict aggregates, int *__restrict num_unaggregated) { +#if __CUDA_ARCH__ >= 300 const int NUM_WARPS_PER_CTA = CTA_SIZE / WARP_SIZE; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); @@ -447,6 +451,8 @@ void build_aggregates( const int num_rings, curr_row = max_id; } } + +#endif } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/core/src/classical/interpolators/distance2.cu b/core/src/classical/interpolators/distance2.cu index 25e60cf7..1825fb31 100644 --- a/core/src/classical/interpolators/distance2.cu +++ b/core/src/classical/interpolators/distance2.cu @@ -598,34 +598,35 @@ void Distance2_Interpolator -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. 
-__device__ __forceinline__ int get_work( int *queue, int warp_id ) +__device__ __forceinline__ int get_work( volatile int *offsets, int *queue, int warp_id ) { - int offset = -1; - if ( utils::lane_id() == 0 ) { - offset = atomicAdd( queue, 1 ); + offsets[warp_id] = atomicAdd( queue, 1 ); } - return utils::shfl( offset, 0 ); + return offsets[warp_id]; } -} // namespace distance2_sm35 +} // namespace distance2_sm20 + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -namespace distance2_sm70 +namespace distance2_sm35 { #include -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. __device__ __forceinline__ int get_work( int *queue, int warp_id ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -634,9 +635,12 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) } return utils::shfl( offset, 0 ); +#else + return 0; +#endif } -} // namespace distance2_sm70 +} // namespace distance2_sm35 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -909,22 +913,30 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 distance2_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + distance2_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip coarse rows. @@ -939,6 +951,7 @@ compute_c_hat_kernel( int A_num_rows, set.clear(); // Load the range of the row. __syncthreads(); +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -948,6 +961,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif __syncthreads(); // _iterate over the columns of A to build C_hat. @@ -1069,7 +1092,13 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. 
__shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; - +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -1079,16 +1108,17 @@ compute_c_hat_kernel( int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 distance2_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + distance2_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip coarse rows. @@ -1101,8 +1131,8 @@ compute_c_hat_kernel( int A_num_rows, // Clear the set. set.clear(); - // Load the range of the row. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -1112,6 +1142,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // _iterate over the columns of A to build C_hat. for ( int a_col_it = a_col_begin + lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) @@ -1250,22 +1290,32 @@ compute_inner_sum_kernel( const int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. __shared__ volatile Value_type s_a_values[CTA_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; + // The sign of the diagonal. + __shared__ volatile bool s_sign_diag[NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 distance2_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + distance2_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. 
-#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip coarse rows. @@ -1286,6 +1336,7 @@ compute_inner_sum_kernel( const int A_num_rows, int inner_sum_offset = inner_sum_offsets[a_row_id]; // And share the value of the diagonal. bool sign_diag = false; +#if __CUDA_ARCH__ >= 300 if ( lane_id == 0 ) { @@ -1293,8 +1344,17 @@ compute_inner_sum_kernel( const int A_num_rows, } sign_diag = utils::shfl( sign_diag, 0 ); +#else + + if ( lane_id == 0 ) + { + s_sign_diag[warp_id] = sign( diag[a_row_id] ); + } + sign_diag = s_sign_diag[warp_id]; +#endif // Load A row IDs. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -1304,6 +1364,16 @@ compute_inner_sum_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // Iterate over the columns of A. for ( a_col_it += lane_id ; utils::any( a_col_it < a_col_end ) ; a_col_it += WARP_SIZE ) @@ -1382,7 +1452,9 @@ compute_inner_sum_kernel( const int A_num_rows, } // Reduce the row to a single value. +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) { int hi = __double2hiint(bottom_sum); @@ -1393,6 +1465,18 @@ compute_inner_sum_kernel( const int A_num_rows, } bottom_sum = utils::shfl( bottom_sum, 0 ); +#else + s_a_values[threadIdx.x] = bottom_sum; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_a_values[threadIdx.x] = bottom_sum += s_a_values[threadIdx.x + offset]; + } + + bottom_sum = s_a_values[warp_id * WARP_SIZE]; +#endif if ( lane_id == k && bottom_sum != Value_type(0) ) { @@ -1441,6 +1525,15 @@ compute_inner_sum_kernel( const int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. __shared__ volatile Value_type s_a_values[CTA_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; + // The sign of the diagonal. + __shared__ volatile bool s_sign_diag[NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -1450,16 +1543,17 @@ compute_inner_sum_kernel( const int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 distance2_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + distance2_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. 
-#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip coarse rows. @@ -1480,6 +1574,7 @@ compute_inner_sum_kernel( const int A_num_rows, int inner_sum_offset = inner_sum_offsets[a_row_id]; // And share the value of the diagonal. bool sign_diag = false; +#if __CUDA_ARCH__ >= 300 if ( lane_id == 0 ) { @@ -1496,6 +1591,23 @@ compute_inner_sum_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id == 0 ) + { + s_sign_diag[warp_id] = sign( diag[a_row_id] ); + } + + sign_diag = s_sign_diag[warp_id]; + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // Iterate over the columns of A. for ( a_col_it += lane_id ; utils::any( a_col_it < a_col_end ) ; a_col_it += WARP_SIZE ) @@ -1588,7 +1700,9 @@ compute_inner_sum_kernel( const int A_num_rows, } // Reduce the row to a single value. +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) { int hi = __double2hiint(bottom_sum); @@ -1599,6 +1713,18 @@ compute_inner_sum_kernel( const int A_num_rows, } bottom_sum = utils::shfl( bottom_sum, lane_id_mod_num_threads * NUM_THREADS_PER_ROW ); +#else + s_a_values[threadIdx.x] = bottom_sum; +#pragma unroll + + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) + if ( (lane_id & (NUM_THREADS_PER_ROW - 1)) < offset ) + { + s_a_values[threadIdx.x] = bottom_sum += s_a_values[threadIdx.x + offset]; + } + + bottom_sum = s_a_values[warp_id * WARP_SIZE + lane_id_mod_num_threads * NUM_THREADS_PER_ROW]; +#endif if ( lane_id >= k && lane_id < k + NUM_THREADS_PER_ROW && bottom_sum != Value_type(0) ) { @@ -1650,11 +1776,18 @@ compute_interp_weight_kernel( const int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. __shared__ volatile Value_type s_aki[NUM_WARPS]; +#if __CUDA_ARCH__ >= 300 // The hash values stored in shared memory. -#if __CUDA_ARCH__ >= 700 - __shared__ volatile distance2_sm70::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; -#else __shared__ volatile distance2_sm35::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; + // Shared memory to store the values in the hash table. + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; + // Shared memory for broadcast. + __shared__ volatile Value_type s_a_values[CTA_SIZE]; #endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); @@ -1662,24 +1795,25 @@ compute_interp_weight_kernel( const int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. 
-#if __CUDA_ARCH__ >= 700 - distance2_sm70::Hash_map map( &s_keys[warp_id * SMEM_SIZE], +#if __CUDA_ARCH__ >= 300 + distance2_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], &s_vote[warp_id * SMEM_SIZE / 4], &g_vals[a_row_id * gmem_size], gmem_size ); #else - distance2_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], + distance2_sm20::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = distance2_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { int coarse_fine_id = cf_map[a_row_id]; @@ -1709,6 +1843,7 @@ compute_interp_weight_kernel( const int A_num_rows, int c_hat_end = C_hat_end [a_row_id]; map.load( c_hat_end - c_hat_it, &C_hat[c_hat_it], &C_hat_pos[c_hat_it] ); // Load A row IDs. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -1718,7 +1853,16 @@ compute_interp_weight_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // The offset in the inner sum table. int inner_sum_offset = inner_sum_offsets[a_row_id]; // Weak value. @@ -1767,6 +1911,7 @@ compute_interp_weight_kernel( const int A_num_rows, int num_rows = __popc( vote ); // We pre-load inner sums. +#if __CUDA_ARCH__ >= 300 sum = Value_type(0); if ( lane_id < num_rows ) @@ -1774,6 +1919,14 @@ compute_interp_weight_kernel( const int A_num_rows, sum = inner_sum[inner_sum_offset + lane_id]; } +#else + + if ( lane_id < num_rows ) + { + s_a_values[threadIdx.x] = inner_sum[inner_sum_offset + lane_id]; + } + +#endif inner_sum_offset += num_rows; // For each warp, we have up to 32 rows of B to proceed. @@ -1794,7 +1947,11 @@ compute_interp_weight_kernel( const int A_num_rows, } // Load the kth inner sum. +#if __CUDA_ARCH__ >= 300 Value_type uniform_val = utils::shfl( sum, k ); +#else + Value_type uniform_val = s_a_values[warp_id * WARP_SIZE + k]; +#endif // _iterate over the range of columns of B. for ( int b_col_it = b_col_begin + lane_id ; utils::any( b_col_it < b_col_end ) ; b_col_it += WARP_SIZE ) @@ -1834,6 +1991,7 @@ compute_interp_weight_kernel( const int A_num_rows, } // We're done with that row of A. We compute D. 
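// The reduction patched below collapses each lane's partial sum into a single
// value per warp.  A minimal sketch of both variants, assuming a converged
// 32-lane warp; warp_sum and warp_sum_smem are hypothetical names standing in
// for the utils::shfl_xor and shared-memory paths of the patched code:
__device__ __forceinline__ double warp_sum( double x )
{
    // Butterfly reduction: after five xor steps every lane holds the warp total.
    for ( int mask = 16 ; mask > 0 ; mask >>= 1 )
    {
        x += __shfl_xor_sync( 0xffffffffu, x, mask );
    }

    return x;
}

__device__ __forceinline__ double warp_sum_smem( volatile double *s_mem, double x, int lane_id )   // hypothetical pre-SM30 variant
{
    // Pre-SM30 fallback: tree reduction through shared memory; lane 0's slot
    // ends up holding the warp total (warp-synchronous assumption).
    s_mem[threadIdx.x] = x;

    for ( int offset = 16 ; offset > 0 ; offset >>= 1 )
        if ( lane_id < offset )
        {
            s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset];
        }

    return s_mem[threadIdx.x - lane_id];
}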
+#if __CUDA_ARCH__ >= 300 #pragma unroll for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) @@ -1848,7 +2006,27 @@ compute_interp_weight_kernel( const int A_num_rows, } sum = utils::shfl( sum, 0 ); +#else + s_a_values[threadIdx.x] = weak; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_a_values[threadIdx.x] = weak += s_a_values[threadIdx.x + offset]; + } + + weak = s_a_values[warp_id * WARP_SIZE]; + + if ( lane_id == 0 ) + { + weak += diag[a_row_id]; + s_a_values[threadIdx.x] = Value_type(-1) / weak; + } + sum = s_a_values[warp_id * WARP_SIZE]; +#endif +#if __CUDA_ARCH__ >= 300 int p_col_tmp = -1; if ( lane_id < 2 ) @@ -1858,7 +2036,16 @@ compute_interp_weight_kernel( const int A_num_rows, int p_col_it = utils::shfl( p_col_tmp, 0 ); int p_col_end = utils::shfl( p_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = P_rows[a_row_id + lane_id]; + } + + int p_col_it = s_rows[2 * warp_id + 0]; + int p_col_end = s_rows[2 * warp_id + 1]; +#endif map.store_map_keys_scale_values( p_col_end - p_col_it, cf_map, &P_cols[p_col_it], sum, &P_vals[p_col_it] ); } } diff --git a/core/src/classical/interpolators/multipass.cu b/core/src/classical/interpolators/multipass.cu index ce2fd6e3..1df77ba5 100644 --- a/core/src/classical/interpolators/multipass.cu +++ b/core/src/classical/interpolators/multipass.cu @@ -270,34 +270,35 @@ void Multipass_Interpolator -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. -__device__ __forceinline__ int get_work( int *queue, int warp_id ) +__device__ __forceinline__ int get_work( volatile int *offsets, int *queue, int warp_id ) { - int offset = -1; - if ( utils::lane_id() == 0 ) { - offset = atomicAdd( queue, 1 ); + offsets[warp_id] = atomicAdd( queue, 1 ); } - return utils::shfl( offset, 0 ); + return offsets[warp_id]; } -} // namespace multipass_sm35 +} // namespace multipass_sm20 -namespace multipass_sm70 +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace multipass_sm35 { #include -#include // Included inside the namespace to solve name colisions. +#include // Included inside the namespace to solve name colisions. __device__ __forceinline__ int get_work( int *queue, int warp_id ) { +#if __CUDA_ARCH__ >= 300 int offset = -1; if ( utils::lane_id() == 0 ) @@ -306,9 +307,12 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) } return utils::shfl( offset, 0 ); +#else + return 0; +#endif } -} // namespace multipass_sm70 +} // namespace multipass_sm35 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -330,6 +334,9 @@ estimate_c_hat_size_kernel( const int A_num_rows, int *C_hat_offsets, int pass ) { +#if __CUDA_ARCH__ < 300 + __shared__ volatile int s_mem[CTA_SIZE]; +#endif const int NUM_WARPS_PER_CTA = CTA_SIZE / WARP_SIZE; // Number of items per grid. 
const int NUM_WARPS_PER_GRID = gridDim.x * NUM_WARPS_PER_CTA; @@ -378,6 +385,7 @@ estimate_c_hat_size_kernel( const int A_num_rows, } // Do reduction +#if __CUDA_ARCH__ >= 300 #pragma unroll for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) @@ -385,6 +393,18 @@ estimate_c_hat_size_kernel( const int A_num_rows, my_count += utils::shfl_xor( my_count, mask ); } +#else + s_mem[threadIdx.x] = my_count; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_mem[threadIdx.x] = my_count += s_mem[threadIdx.x + offset]; + } + +#endif + // Write result -- RACE CONDITION! if (lane_id == 0) { @@ -408,6 +428,11 @@ compute_c_hat_first_pass_kernel( int A_num_rows, { const int NUM_WARPS_PER_CTA = CTA_SIZE / WARP_SIZE; const int NUM_WARPS_PER_GRID = gridDim.x * NUM_WARPS_PER_CTA; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS_PER_CTA]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -424,6 +449,7 @@ compute_c_hat_first_pass_kernel( int A_num_rows, } // Load the range of the row. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -433,7 +459,16 @@ compute_c_hat_first_pass_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif int count = 0; int c_col_it = C_hat_start[a_row_id]; @@ -506,6 +541,13 @@ compute_c_hat_kernel( int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. __shared__ volatile KeyType s_keys[NUM_WARPS * SMEM_SIZE]; +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -515,16 +557,17 @@ compute_c_hat_kernel( int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - multipass_sm70::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); -#else +#if __CUDA_ARCH__ >= 300 multipass_sm35::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); +#else + multipass_sm20::Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Skip if not in current pass @@ -536,6 +579,7 @@ compute_c_hat_kernel( int A_num_rows, // Clear the set. set.clear(); // Load the range of the row. 
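// The load patched just below, and several later hunks, use the same idiom to
// fetch a CSR row's [begin, end) range: two lanes read the offsets, the warp
// broadcasts them.  A minimal sketch, assuming a converged warp; load_row_range
// is a hypothetical helper, not part of the patch:
__device__ __forceinline__ void load_row_range( const int *A_rows, int a_row_id, int &a_col_begin, int &a_col_end )
{
    const int lane_id = threadIdx.x & 31;
    int tmp = -1;

    if ( lane_id < 2 )
    {
        tmp = A_rows[a_row_id + lane_id];   // lane 0 -> row begin, lane 1 -> row end
    }

    a_col_begin = __shfl_sync( 0xffffffffu, tmp, 0 );
    a_col_end   = __shfl_sync( 0xffffffffu, tmp, 1 );
}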
+#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -545,6 +589,16 @@ compute_c_hat_kernel( int A_num_rows, int a_col_begin = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_begin = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // _iterate over the columns of A to build C_hat. for ( int a_col_it = a_col_begin + lane_id ; utils::any(a_col_it < a_col_end) ; a_col_it += WARP_SIZE ) @@ -658,7 +712,14 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, { const int NUM_WARPS_PER_CTA = CTA_SIZE / 32; const int NUM_WARPS_PER_GRID = gridDim.x * NUM_WARPS_PER_CTA; - +#if __CUDA_ARCH__ >= 300 +#else + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS_PER_CTA]; + // Shared memory for broadcast. + __shared__ volatile Value_type s_n_values[CTA_SIZE]; + __shared__ volatile Value_type s_c_values[CTA_SIZE]; +#endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -690,6 +751,7 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, else if (assigned[a_row_id] == 1) { // Load A row IDs. +#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -699,7 +761,17 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif +#if __CUDA_ARCH__ >= 300 int p_col_tmp = -1; if ( lane_id < 2 ) @@ -709,7 +781,16 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, int p_col_it = utils::shfl( p_col_tmp, 0 ); int p_col_end = utils::shfl( p_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = P_rows[a_row_id + lane_id]; + } + + int p_col_it = s_rows[2 * warp_id + 0]; + int p_col_end = s_rows[2 * warp_id + 1]; +#endif // Weak value. Value_type sum_N(0), sum_C(0), alfa(0); int count = 0; @@ -763,7 +844,9 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, } // We're done with that row of A, now reduce sum_N and sum_C +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) { sum_C += utils::shfl_xor( sum_C, mask ); @@ -777,6 +860,33 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, } alfa = utils::shfl( alfa, 0 ); +#else + s_c_values[threadIdx.x] = sum_C; + s_n_values[threadIdx.x] = sum_N; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_c_values[threadIdx.x] = sum_C += s_c_values[threadIdx.x + offset]; + s_n_values[threadIdx.x] = sum_N += s_n_values[threadIdx.x + offset]; + } + + sum_C = s_c_values[warp_id * WARP_SIZE]; + sum_N = s_n_values[warp_id * WARP_SIZE]; + + if ( lane_id == 0 ) + { + if (fabs(sum_C * diag[a_row_id]) < 1e-10) { printf("Dividing by zero\n"); } + + double div = (fabs(sum_C * diag[a_row_id]) < 1e-10) ? 1. 
: sum_C * diag[a_row_id]; + alfa = -sum_N / div; + // alfa = -sum_N/(sum_C*diag[a_row_id]); + s_c_values[threadIdx.x] = alfa; + } + + alfa = s_c_values[warp_id * WARP_SIZE]; +#endif // Scale the value of P for ( p_col_it += lane_id ; utils::any( p_col_it < p_col_end ) ; p_col_it += WARP_SIZE ) @@ -791,6 +901,8 @@ compute_interp_weight_first_pass_kernel( const int A_num_rows, } // Loop over rows } + + template< typename Value_type, int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, typename KeyType > __global__ __launch_bounds__( CTA_SIZE ) void @@ -815,6 +927,7 @@ compute_interp_weight_kernel( const int A_num_rows, const int *__restrict assigned, int pass) { +#if 1 const int NUM_WARPS = CTA_SIZE / 32; // The hash keys stored in shared memory. __shared__ volatile KeyType s_keys[NUM_WARPS * SMEM_SIZE]; @@ -822,11 +935,18 @@ compute_interp_weight_kernel( const int A_num_rows, __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads store a value of B to load. __shared__ volatile Value_type s_b_values[CTA_SIZE]; +#if __CUDA_ARCH__ >= 300 // The hash values stored in shared memory. -#if __CUDA_ARCH__ >= 700 - __shared__ volatile multipass_sm70::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; -#else __shared__ volatile multipass_sm35::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; +#else + // Shared memory to acquire work. + __shared__ volatile int s_offsets[NUM_WARPS]; + // Shared memory to store where to load from. + __shared__ volatile int s_rows[2 * NUM_WARPS]; + // Shared memory to store the values in the hash table. + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; + __shared__ volatile Value_type s_n_values[CTA_SIZE]; + __shared__ volatile Value_type s_c_values[CTA_SIZE]; #endif // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); @@ -834,24 +954,25 @@ compute_interp_weight_kernel( const int A_num_rows, // First threads load the row IDs of A needed by the CTA... volatile int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. -#if __CUDA_ARCH__ >= 700 - multipass_sm70::Hash_map map( &s_keys[warp_id * SMEM_SIZE], +#if __CUDA_ARCH__ >= 300 + multipass_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], &s_vote[warp_id * SMEM_SIZE / 4], &g_vals[a_row_id * gmem_size], gmem_size ); #else - multipass_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], + multipass_sm20::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); #endif // Loop over rows of A. -#if __CUDA_ARCH__ >= 700 - for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm70::get_work( wk_work_queue, warp_id ) ) -#else +#if __CUDA_ARCH__ >= 300 + for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm35::get_work( wk_work_queue, warp_id ) ) +#else + for ( ; a_row_id < A_num_rows ; a_row_id = multipass_sm20::get_work( s_offsets, wk_work_queue, warp_id ) ) #endif { // Only do work if assigned[row_id] == pass @@ -867,6 +988,7 @@ compute_interp_weight_kernel( const int A_num_rows, int c_hat_end = c_hat_it + C_hat_size[a_row_id]; map.load( c_hat_end - c_hat_it, &C_hat[c_hat_it], &C_hat_pos[c_hat_it] ); // Load A row IDs. 
+#if __CUDA_ARCH__ >= 300 int a_col_tmp = -1; if ( lane_id < 2 ) @@ -876,7 +998,16 @@ compute_interp_weight_kernel( const int A_num_rows, int a_col_it = utils::shfl( a_col_tmp, 0 ); int a_col_end = utils::shfl( a_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = A_rows[a_row_id + lane_id]; + } + + int a_col_it = s_rows[2 * warp_id + 0]; + int a_col_end = s_rows[2 * warp_id + 1]; +#endif // sums Value_type sum_N(0), sum_C(0), alfa(0); @@ -957,7 +1088,9 @@ compute_interp_weight_kernel( const int A_num_rows, } // We're done with that row of A, now reduce sum_N and sum_C +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) { sum_C += utils::shfl_xor( sum_C, mask ); @@ -972,7 +1105,31 @@ compute_interp_weight_kernel( const int A_num_rows, } alfa = utils::shfl( alfa, 0 ); +#else + s_c_values[threadIdx.x] = sum_C; + s_n_values[threadIdx.x] = sum_N; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_c_values[threadIdx.x] = sum_C += s_c_values[threadIdx.x + offset]; + s_n_values[threadIdx.x] = sum_N += s_n_values[threadIdx.x + offset]; + } + + sum_C = s_c_values[warp_id * WARP_SIZE]; + sum_N = s_n_values[warp_id * WARP_SIZE]; + + if ( lane_id == 0 ) + { + double div = (fabs(sum_C * diag[a_row_id]) < 1e-10) ? 1. : sum_C * diag[a_row_id]; + alfa = -sum_N / div; + s_c_values[threadIdx.x] = alfa; + } + alfa = s_c_values[warp_id * WARP_SIZE]; +#endif +#if __CUDA_ARCH__ >= 300 int p_col_tmp = -1; if ( lane_id < 2 ) @@ -982,9 +1139,20 @@ compute_interp_weight_kernel( const int A_num_rows, int p_col_it = utils::shfl( p_col_tmp, 0 ); int p_col_end = utils::shfl( p_col_tmp, 1 ); +#else + if ( lane_id < 2 ) + { + s_rows[2 * warp_id + lane_id] = P_rows[a_row_id + lane_id]; + } + + int p_col_it = s_rows[2 * warp_id + 0]; + int p_col_end = s_rows[2 * warp_id + 1]; +#endif map.store_keys_scale_values( p_col_end - p_col_it, &P_cols[p_col_it], alfa, &P_vals[p_col_it] ); } + +#endif } } // namespace multipass diff --git a/core/src/classical/strength/affinity.cu b/core/src/classical/strength/affinity.cu index 8daf8eeb..bd48af03 100644 --- a/core/src/classical/strength/affinity.cu +++ b/core/src/classical/strength/affinity.cu @@ -137,9 +137,14 @@ void initRandom(Vector &vec, int size) template static __device__ __inline__ T cahedRead (const T *ar) { +#if __CUDA_ARCH__ >= 350 return utils::Ld::load(ar); +#else + return utils::Ld::load(ar); +#endif } +#if __CUDA_ARCH__ >= 300 // ! shfl is defined for int/float only in arch 600 template< typename ValueType > __forceinline__ __device__ void reduce_inwarp_mul( @@ -160,6 +165,33 @@ __forceinline__ __device__ void reduce_inwarp_mul( *s_xy = sum; } +#else +template< typename ValueType > +__forceinline__ __device__ void reduce_inwarp_mul( + const ValueType vValueA, + const ValueType vValueB, + const int vecId, + const int n_per_warp, + volatile ValueType *smem, + double *s_xy) +{ + ValueType sum = vValueA * vValueB; + smem[threadIdx.x] = sum; +#pragma unroll + + for ( int offset = 16 / n_per_warp ; offset > 0 ; offset /= 2 ) + if ( vecId < offset ) + { + smem[threadIdx.x] = sum = sum + smem[threadIdx.x + offset]; + } + + if ( vecId == 0 ) + { + // If laneId=0, then sum is in smem[threadIdx.x]. 
+ *s_xy = sum; + } +} +#endif template< typename ValueType > __forceinline__ __device__ void reduce_inblock_mul( @@ -240,7 +272,11 @@ void computeAffinity_1x1_Kernel(const int *A_rows, const int num_vecs_per_warp = 32 / nTV; const int num_rows_per_iter = gridDim.x * blockDim.x / nTV; //const int num_vecs_per_block = blockDim.x / nTV; +#if __CUDA_ARCH__ >= 300 ValueTypeB *smem = NULL; +#else + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif double s_xx, s_xy, s_yy; for ( int aRowId = tid / nTV ; aRowId < A_num_rows ; diff --git a/core/src/matrix_coloring/coloring_utils.cu b/core/src/matrix_coloring/coloring_utils.cu index d6f09819..6c0b3a28 100644 --- a/core/src/matrix_coloring/coloring_utils.cu +++ b/core/src/matrix_coloring/coloring_utils.cu @@ -87,7 +87,11 @@ __global__ void permute_colors_kernel(int num_rows, int *row_colors, int *color_ for ( ; row_id < num_rows ; row_id += blockDim.x * gridDim.x ) { int color = row_colors[row_id]; +#if __CUDA_ARCH__ >= 350 color = __ldg(color_permutation + color); +#else + color = color_permutation[color]; +#endif row_colors[row_id] = color; } } diff --git a/core/src/matrix_coloring/greedy_min_max_2ring.cu b/core/src/matrix_coloring/greedy_min_max_2ring.cu index 5df9d46f..286f3137 100644 --- a/core/src/matrix_coloring/greedy_min_max_2ring.cu +++ b/core/src/matrix_coloring/greedy_min_max_2ring.cu @@ -208,6 +208,7 @@ void find_max_neighbor_kernel_and_propagate_used_colors( const int A_num_rows, } //BEGIN: Reduce used_colors/max_hashes amongst subwarps +#if __CUDA_ARCH__ >= 300 #pragma unroll for (int i = WARP_SIZE / 2; i >= 1; i /= 2) @@ -230,6 +231,36 @@ void find_max_neighbor_kernel_and_propagate_used_colors( const int A_num_rows, used_colors |= tmpu; } +#else + __shared__ volatile int s_max_hash[CTA_SIZE + WARP_SIZE / 2]; + __shared__ volatile int s_max_hash_id[CTA_SIZE + WARP_SIZE / 2]; + __shared__ volatile long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_max_hash[threadIdx.x] = max_hash; + s_max_hash_id[threadIdx.x] = max_hash_id; + int tmp = s_max_hash[threadIdx.x + i]; + int tmp_id = s_max_hash_id[threadIdx.x + i]; + + if (lane_id + i < WARP_SIZE) + { + if (tmp_id >= 0 && (max_hash_id < 0 || tmp > max_hash || (tmp == max_hash && tmp_id >= max_hash_id))) + { + max_hash = tmp; + max_hash_id = tmp_id; + } + } + + /// + s_used_colors[threadIdx.x] = used_colors; + long long tmpu = s_used_colors[threadIdx.x + i]; + + if (lane_id + i < WARP_SIZE) { used_colors |= tmpu; } + } + +#endif //END: Reduce used_colors/max_hashes amongst subwarps //The subwarp leader stores the result. @@ -324,6 +355,7 @@ void color_kernel_greedy_onlymax( } //reduce used colors bit by bit. +#if __CUDA_ARCH__ >= 300 #pragma unroll for (int i = WARP_SIZE / 2; i >= 1; i /= 2) @@ -336,6 +368,19 @@ void color_kernel_greedy_onlymax( used_colors |= tmp; } +#else + __shared__ volatile long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_used_colors[threadIdx.x] = used_colors; + long long tmp = s_used_colors[threadIdx.x + i]; + + if (lane_id + i < WARP_SIZE) { used_colors |= tmp; } + } + +#endif int my_color_1 = 64 - utils::bfind( ~used_colors ); if (__popc(used_colors) >= 64 || my_color_1 > 64 || my_color_1 <= 0) @@ -579,6 +624,7 @@ void color_kernel_greedy_gtlt( //is_max_vertex = row_gt_count==0; //is_min_vertex = false; //reduce used colors bit by bit. 
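// The hunk below OR-reduces a 64-bit colour mask across the warp; the patched
// code splits the value into hi/lo 32-bit words (the __double2hiint trick seen
// further down) because older shuffles were 32-bit only.  A compact sketch of
// the same reduction with the native 64-bit shuffle, assuming a converged warp;
// warp_or64 is a hypothetical name:
__device__ __forceinline__ long long warp_or64( long long used_colors )
{
    for ( int i = 16 ; i >= 1 ; i /= 2 )
    {
        // __shfl_xor_sync accepts 64-bit operands directly on current toolkits.
        used_colors |= __shfl_xor_sync( 0xffffffffu, used_colors, i );
    }

    return used_colors;
}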
+#if __CUDA_ARCH__ >= 300 #pragma unroll for (int i = WARP_SIZE / 2; i >= 1; i /= 2) @@ -591,6 +637,19 @@ void color_kernel_greedy_gtlt( used_colors |= tmp; } +#else + __shared__ volatile long long s_used_colors[CTA_SIZE + WARP_SIZE / 2]; +#pragma unroll + + for (int i = 1; i <= WARP_SIZE / 2; i *= 2) + { + s_used_colors[threadIdx.x] = used_colors; + long long tmp = s_used_colors[threadIdx.x + i]; + + if (lane_id + i < WARP_SIZE) { used_colors |= tmp; } + } + +#endif int my_color_1 = 0; int my_color_2 = 0; int free_colors = __popc(used_colors); diff --git a/core/src/matrix_coloring/min_max.cu b/core/src/matrix_coloring/min_max.cu index 23df42cf..960e720f 100644 --- a/core/src/matrix_coloring/min_max.cu +++ b/core/src/matrix_coloring/min_max.cu @@ -199,6 +199,10 @@ find_min_max_neighbors_kernel( const int *__restrict A_offsets, const int NUM_ROWS_PER_CTA = CTA_SIZE / NUM_THREADS_PER_ROW; const int warp_id = threadIdx.x / NUM_THREADS_PER_ROW; const int lane_id = threadIdx.x % NUM_THREADS_PER_ROW; +#if __CUDA_ARCH__ < 300 + __shared__ volatile int s_min_hash[CTA_SIZE]; + __shared__ volatile int s_max_hash[CTA_SIZE]; +#endif for ( int row_id = blockIdx.x * NUM_ROWS_PER_CTA + warp_id ; utils::any(row_id < num_rows) ; row_id += gridDim.x * NUM_ROWS_PER_CTA ) { @@ -255,13 +259,29 @@ find_min_max_neighbors_kernel( const int *__restrict A_offsets, } } +#if __CUDA_ARCH__ < 300 + s_min_hash[threadIdx.x] = min_hash; + s_max_hash[threadIdx.x] = max_hash; +#pragma unroll + + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_min_hash[threadIdx.x] = min_hash = min( min_hash, s_min_hash[threadIdx.x + offset] ); + s_max_hash[threadIdx.x] = max_hash = max( max_hash, s_max_hash[threadIdx.x + offset] ); + } + +#else #pragma unroll + for ( int mask = NUM_THREADS_PER_ROW / 2 ; mask > 0 ; mask >>= 1 ) { min_hash = min( min_hash, utils::shfl_xor( min_hash, mask ) ); max_hash = max( max_hash, utils::shfl_xor( max_hash, mask ) ); } +#endif + if ( row_id < num_rows && lane_id == 0 ) { max_hash_array[row_id] = max_hash; diff --git a/core/src/matrix_coloring/min_max_2ring.cu b/core/src/matrix_coloring/min_max_2ring.cu index ee56cc0f..6f2776e6 100644 --- a/core/src/matrix_coloring/min_max_2ring.cu +++ b/core/src/matrix_coloring/min_max_2ring.cu @@ -654,7 +654,9 @@ void color_kernel_greedy( const int A_num_rows, const int *A_rows, const int *A_ is_min_vertex = false; //reduce used colors bit by bit. 
+#if __CUDA_ARCH__ >= 350 #pragma unroll + for (int i = WARP_SIZE / 2; i >= 1; i /= 2) { int tmp_hi = __double2hiint( __longlong_as_double( used_colors ) ); @@ -665,6 +667,7 @@ void color_kernel_greedy( const int A_num_rows, const int *A_rows, const int *A_ used_colors |= tmp; } +#endif int my_color = 64 - utils::bfind( ~used_colors ); if (my_color <= 0) { my_color = 1; } diff --git a/core/src/scalers/nbinormalization.cu b/core/src/scalers/nbinormalization.cu index 4af53de6..2eba79a7 100644 --- a/core/src/scalers/nbinormalization.cu +++ b/core/src/scalers/nbinormalization.cu @@ -169,10 +169,16 @@ void computeBetaGammaDevice(IndexType rows, IndexType *offsets, IndexType *indic const int vectors_per_block = VectorsPerCTA; const int vector_id = threadIdx.x / VectorSize; const int lane_id = threadIdx.x % VectorSize; +#if __CUDA_ARCH__ >= 300 +#else + __shared__ IndexType row_offsets[2 * VectorsPerCTA]; + __shared__ volatile VectorValue bi_sum[CTASize + VectorSize / 2]; +#endif for (int i = vectors_per_block * blockIdx.x + vector_id; i < rows; i += vectors_per_block * gridDim.x) { // load start + end pointers +#if __CUDA_ARCH__ >= 300 int row_tmp; if (lane_id < 2) @@ -183,6 +189,16 @@ void computeBetaGammaDevice(IndexType rows, IndexType *offsets, IndexType *indic // distribute to all other threads in warp int row_begin = utils::shfl(row_tmp, vector_id * VectorSize, warpSize, utils::activemask()); int row_end = utils::shfl(row_tmp, vector_id * VectorSize + 1, warpSize, utils::activemask()); +#else + + if (lane_id < 2) + { + row_offsets[2 * vector_id + lane_id] = offsets[i + lane_id]; + } + + int row_begin = row_offsets[2 * vector_id]; + int row_end = row_offsets[2 * vector_id + 1]; +#endif VectorValue bi(0.); for (int jj = row_begin + lane_id; utils::any(jj < row_end, utils::activemask()); jj += VectorSize) @@ -200,7 +216,12 @@ void computeBetaGammaDevice(IndexType rows, IndexType *offsets, IndexType *indic } // reduce over bi +#if __CUDA_ARCH__ >= 300 VectorValue bi_s = warpReduceSum(bi); +#else + bi_sum[threadIdx.x] = bi; + VectorValue bi_s = warpReduceSumShared(&bi_sum[vector_id * VectorSize], lane_id); +#endif if (lane_id == 0) { @@ -218,10 +239,15 @@ void computeGammaDevice(int rows, IndexType *offsets, IndexType *indices, Matrix const int vectors_per_block = CTASize / VectorSize; const int vector_id = threadIdx.x / VectorSize; const int lane_id = threadIdx.x % VectorSize; +#if __CUDA_ARCH__ >= 300 +#else + __shared__ int row_offsets[2 * (CTASize / VectorSize)]; +#endif for (int i = vectors_per_block * blockIdx.x + vector_id; i < rows; i += vectors_per_block * gridDim.x) { // load start + end pointers +#if __CUDA_ARCH__ >= 300 int row_tmp; if (lane_id < 2) @@ -232,6 +258,16 @@ void computeGammaDevice(int rows, IndexType *offsets, IndexType *indices, Matrix // distribute to all other threads in warp int row_begin = utils::shfl(row_tmp, vector_id * VectorSize, warpSize, utils::activemask()); int row_end = utils::shfl(row_tmp, vector_id * VectorSize + 1, warpSize, utils::activemask()); +#else + + if (lane_id < 2) + { + row_offsets[2 * vector_id + lane_id] = offsets[i + lane_id]; + } + + int row_begin = row_offsets[2 * vector_id]; + int row_end = row_offsets[2 * vector_id + 1]; +#endif for (int jj = row_begin + lane_id; utils::any(jj < row_end, utils::activemask()); jj += VectorSize) { @@ -257,10 +293,16 @@ void computeBetaDevice(int rows, IndexType *offsets, IndexType *indices, MatrixV const int vectors_per_block = CTASize / VectorSize; const int vector_id = threadIdx.x / VectorSize; const 
int lane_id = threadIdx.x % VectorSize; +#if __CUDA_ARCH__ >= 300 +#else + __shared__ int row_offsets[2 * (CTASize / VectorSize)]; + __shared__ volatile VectorValue bi_sum[CTASize + VectorSize / 2]; +#endif for (int i = vectors_per_block * blockIdx.x + vector_id; i < rows; i += vectors_per_block * gridDim.x) { // load start + end pointers +#if __CUDA_ARCH__ >= 300 int row_tmp; if (lane_id < 2) @@ -271,6 +313,16 @@ void computeBetaDevice(int rows, IndexType *offsets, IndexType *indices, MatrixV // distribute to all other threads in warp int row_begin = utils::shfl(row_tmp, vector_id * VectorSize, warpSize, utils::activemask()); int row_end = utils::shfl(row_tmp, vector_id * VectorSize + 1, warpSize, utils::activemask()); +#else + + if (lane_id < 2) + { + row_offsets[2 * vector_id + lane_id] = offsets[i + lane_id]; + } + + int row_begin = row_offsets[2 * vector_id]; + int row_end = row_offsets[2 * vector_id + 1]; +#endif VectorValue bi = 0.; for (int jj = row_begin + lane_id; utils::any(jj < row_end, utils::activemask()); jj += VectorSize) @@ -287,7 +339,12 @@ void computeBetaDevice(int rows, IndexType *offsets, IndexType *indices, MatrixV } // reduce over bi +#if __CUDA_ARCH__ >= 300 VectorValue bi_s = warpReduceSum(bi); +#else + bi_sum[threadIdx.x] = bi; + VectorValue bi_s = warpReduceSumShared(&bi_sum[vector_id * VectorSize], lane_id); +#endif if (lane_id == 0) { diff --git a/core/src/solvers/dense_lu_solver.cu b/core/src/solvers/dense_lu_solver.cu index fefbd822..2fc6d8b1 100644 --- a/core/src/solvers/dense_lu_solver.cu +++ b/core/src/solvers/dense_lu_solver.cu @@ -136,6 +136,7 @@ void csr_to_dense_kernel( } +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 template< int N, bool ROW_MAJOR, int WARP_SIZE, typename Value_type > static __device__ __forceinline__ Value_type reduce_distributed_vectors( Value_type x, int is_leader, unsigned int active_mask ) @@ -166,16 +167,53 @@ Value_type reduce_distributed_vectors( Value_type x, int is_leader, unsigned int return x; } +#else +template< int N, bool ROW_MAJOR, int WARP_SIZE, typename Value_type > +static __device__ __forceinline__ +Value_type reduce_distributed_vectors( volatile Value_type *s_mem, Value_type x, int is_leader ) +{ + if ( N & (N - 1) ) + { +#pragma unroll + + for ( int i = 1 ; i < N ; ++i ) + { + const int offset = ROW_MAJOR ? i : N * i; + + if ( is_leader && utils::lane_id() < WARP_SIZE - offset ) + { + s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset]; + } + } + } + else + { +#pragma unroll + + for ( int i = 1 ; i < N ; i <<= 1 ) + { + const int offset = ROW_MAJOR ? i : N * i; + + if ( utils::lane_id() < WARP_SIZE - offset ) + { + s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset]; + } + } + } + + return x; +} +#endif template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void b_minus_A_halo_x( const int *__restrict A_rows, const int *__restrict A_cols, @@ -204,8 +242,17 @@ void b_minus_A_halo_x( const int *__restrict A_rows, // Useful index to compute matrix products. const int lane_id_mod_NxN_div_N = lane_id_mod_NxN / N; const int lane_id_mod_NxN_mod_N = lane_id_mod_NxN % N; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. 
+ __shared__ volatile int s_a_col_ids[CTA_SIZE]; + __shared__ volatile int s_a_col_is_valid[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[threadIdx.x - lane_id_mod_NxN]; + volatile int *my_s_a_col_is_valid = &s_a_col_is_valid[threadIdx.x - lane_id_mod_NxN]; +#else // We to get my data from when I use SHFL. const int shfl_offset = lane_id - lane_id_mod_NxN; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -273,6 +320,10 @@ void b_minus_A_halo_x( const int *__restrict A_rows, // Determine if the column is halo column int a_col_is_valid = (a_col_id != -1) && (a_col_id >= num_owned_rows); +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[lane_id_mod_NxN] = a_col_id; + my_s_a_col_is_valid[lane_id_mod_NxN] = a_col_is_valid; +#endif // Count the number of active columns. // int vote = __ballot(aColId != -1); // The number of iterations. @@ -283,8 +334,13 @@ void b_minus_A_halo_x( const int *__restrict A_rows, { int my_k = k + lane_id_mod_NxN_div_N; // Load N blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; + int uniform_a_col_is_valid = my_s_a_col_is_valid[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k, WARP_SIZE, active_mask); int uniform_a_col_is_valid = utils::shfl( a_col_is_valid, shfl_offset + my_k, WARP_SIZE, active_mask ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 && uniform_a_col_is_valid) @@ -334,7 +390,12 @@ void b_minus_A_halo_x( const int *__restrict A_rows, is_leader = lane_id_mod_NxN_mod_N == 0; } +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors( my_bmAx, is_leader, active_mask ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors( s_mem, my_bmAx, is_leader ); +#endif // Update the shared terms. if ( ROW_MAJOR ) @@ -363,7 +424,12 @@ void b_minus_A_halo_x( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors( my_bmAx, is_leader, active_mask ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors( s_mem, my_bmAx, is_leader ); +#endif // Store the results. if ( ROW_MAJOR ) diff --git a/core/src/solvers/kaczmarz_solver.cu b/core/src/solvers/kaczmarz_solver.cu index aad6ab89..f423a384 100644 --- a/core/src/solvers/kaczmarz_solver.cu +++ b/core/src/solvers/kaczmarz_solver.cu @@ -264,6 +264,9 @@ __global__ void kaczmarz_smooth_kernel_warp_atomics(const IndexType num_rows, { const int num_warps = kCtaSize / 32; const int num_rows_per_iter = num_warps * gridDim.x; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif const int warpId = threadIdx.x / 32; const int laneId = threadIdx.x % 32; @@ -279,7 +282,13 @@ __global__ void kaczmarz_smooth_kernel_warp_atomics(const IndexType num_rows, { ValueTypeB aValue = j < row_end ? Ax[j] : ValueTypeB(0); ValueTypeB xValue = j < row_end ? 
xout[Aj[j]] : ValueTypeB(0); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 r = utils::warp_reduce<1, utils::Add>(aValue * xValue); +#endif + //#else +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + r = utils::warp_reduce<1, utils::Add>(smem, aValue * xValue); +#endif Axi += r; } @@ -312,6 +321,9 @@ __global__ void randomized_kaczmarz_smooth_kernel_warp_atomics(const IndexType n { const int num_warps = kCtaSize / 32; const int num_rows_per_iter = num_warps * gridDim.x; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif const int warpId = threadIdx.x / 32; const int laneId = threadIdx.x % 32; @@ -330,8 +342,15 @@ __global__ void randomized_kaczmarz_smooth_kernel_warp_atomics(const IndexType n { ValueTypeB aValue = j < row_end ? Ax[j] : ValueTypeB(0); ValueTypeB xValue = j < row_end ? xout[Aj[j]] : ValueTypeB(0); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 r = utils::warp_reduce<1, utils::Add>(aValue * xValue); aa = utils::warp_reduce<1, utils::Add>(aValue * aValue); +#endif + //#else +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + r = utils::warp_reduce<1, utils::Add>(smem, aValue * xValue); + aa = utils::warp_reduce<1, utils::Add>(smem, aValue * aValue); +#endif Axi += r; AA += aa; } @@ -367,6 +386,9 @@ __global__ void kaczmarz_smooth_kernel(const IndexType num_rows, IndexType i, t; const int num_warps = kCtaSize / 32; const int num_rows_per_iter = num_warps * gridDim.x; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif const int warpId = threadIdx.x / 32; const int laneId = threadIdx.x % 32; @@ -383,7 +405,13 @@ __global__ void kaczmarz_smooth_kernel(const IndexType num_rows, { ValueTypeB aValue = j < row_end ? Ax[j] : ValueTypeB(0); ValueTypeB xValue = j < row_end ? xout[Aj[j]] : ValueTypeB(0); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 r = utils::warp_reduce<1, utils::Add>(aValue * xValue); +#endif + //#else +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + r = utils::warp_reduce<1, utils::Add>(smem, aValue * xValue); +#endif Axi += r; //Axi += utils::Warp_reduce_linear<1,32>::execute(aValue * xValue); //Axi += Ax[j] * xout[Aj[j]]; @@ -456,6 +484,9 @@ __global__ void multicolor_kaczmarz_smooth_kernel(const IndexType num_rows, { const int num_warps = kCtaSize / 32; const int num_rows_per_iter = num_warps * gridDim.x; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + __shared__ volatile ValueTypeB smem[kCtaSize]; +#endif const int warpId = threadIdx.x / 32; const int laneId = threadIdx.x % 32; int i; @@ -473,7 +504,13 @@ __global__ void multicolor_kaczmarz_smooth_kernel(const IndexType num_rows, { ValueTypeB aValue = j < row_end ? Ax[j] : ValueTypeB(0); ValueTypeB xValue = j < row_end ? 
xout[Aj[j]] : ValueTypeB(0); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 r = utils::warp_reduce<1, utils::Add>(aValue * xValue); + //#else +#endif +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + r = utils::warp_reduce<1, utils::Add>(smem, aValue * xValue); +#endif Axi += r; } diff --git a/core/src/solvers/multicolor_dilu_solver.cu b/core/src/solvers/multicolor_dilu_solver.cu index fc741620..be69e2a4 100644 --- a/core/src/solvers/multicolor_dilu_solver.cu +++ b/core/src/solvers/multicolor_dilu_solver.cu @@ -61,10 +61,10 @@ enum { CTA_SIZE = 128, WARP_SIZE = 32 }; template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE, int NUM_WARP_ITERS_PER_BLOCK > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_setup_NxN_kernel_large( const int *__restrict A_rows, const int *__restrict A_cols, @@ -390,10 +390,10 @@ void DILU_setup_NxN_kernel_large( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_setup_NxN_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -671,10 +671,10 @@ void DILU_setup_NxN_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int NUM_THREADS_PER_ROW, int CTA_SIZE, int WARP_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_setup_1x1_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -703,6 +703,10 @@ void DILU_setup_1x1_kernel( const int *__restrict A_rows, __shared__ volatile int s_A_ji[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. volatile int *my_s_A_ji = &s_A_ji[warp_id * WARP_SIZE]; + // Shared memory to compute a reduction (not needed for >= SM30). +#if __CUDA_ARCH__ < 300 + __shared__ volatile Matrix_type s_mem[CTA_SIZE]; +#endif // Determine which NxN block the threads work with. int a_row_it = blockIdx.x * NUM_WARPS_PER_CTA + warp_id; @@ -819,12 +823,26 @@ void DILU_setup_1x1_kernel( const int *__restrict A_rows, } // current_color != 0 // Reduce the e_outs in one value. +#if __CUDA_ARCH__ >= 300 #pragma unroll + for ( int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1 ) { e_out += utils::shfl_xor( e_out, mask ); } +#else + s_mem[threadIdx.x] = e_out; +#pragma unroll + + for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id < offset ) + { + s_mem[threadIdx.x] = e_out += s_mem[threadIdx.x + offset]; + } + +#endif + // Store the result. 
if ( lane_id == 0 ) { @@ -842,6 +860,7 @@ void DILU_setup_1x1_kernel( const int *__restrict A_rows, /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 template< int N, bool ROW_MAJOR, int WARP_SIZE, typename Value_type > static __device__ __forceinline__ Value_type reduce_distributed_vectors( Value_type x, int is_leader ) @@ -872,15 +891,54 @@ Value_type reduce_distributed_vectors( Value_type x, int is_leader ) return x; } +//#endif +//#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 +#else +template< int N, bool ROW_MAJOR, int WARP_SIZE, typename Value_type > +static __device__ __forceinline__ +Value_type reduce_distributed_vectors( volatile Value_type *s_mem, Value_type x, int is_leader ) +{ + if ( N & (N - 1) ) + { +#pragma unroll + + for ( int i = 1 ; i < N ; ++i ) + { + const int offset = ROW_MAJOR ? i : N * i; + + if ( is_leader && utils::lane_id() < WARP_SIZE - offset ) + { + s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset]; + } + } + } + else + { +#pragma unroll + + for ( int i = 1 ; i < N ; i <<= 1 ) + { + const int offset = ROW_MAJOR ? i : N * i; + + if ( utils::lane_id() < WARP_SIZE - offset ) + { + s_mem[threadIdx.x] = x += s_mem[threadIdx.x + offset]; + } + } + } + + return x; +} +#endif /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_NxN_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -915,8 +973,17 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, // Useful index to compute matrix products. const int lane_id_mod_NxN_div_N = lane_id_mod_NxN / N; const int lane_id_mod_NxN_mod_N = lane_id_mod_NxN % N; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + __shared__ volatile int s_a_col_is_valid[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[threadIdx.x - lane_id_mod_NxN]; + volatile int *my_s_a_col_is_valid = &s_a_col_is_valid[threadIdx.x - lane_id_mod_NxN]; +#else // We to get my data from when I use SHFL. const int shfl_offset = lane_id - lane_id_mod_NxN; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -993,6 +1060,7 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, // Determine if the color is valid. int a_col_is_valid = false; #ifdef AMGX_ILU_COLORING + if ( a_col_id != -1 && current_color != 0 ) { if ( boundary_coloring == FIRST ) @@ -1012,6 +1080,10 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, a_col_is_valid = row_colors[a_col_id] < current_color; } +#endif +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[lane_id_mod_NxN] = a_col_id; + my_s_a_col_is_valid[lane_id_mod_NxN] = a_col_is_valid; #endif // Count the number of active columns. 
// int vote = utils::ballot(aColId != -1); @@ -1023,8 +1095,13 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, { int my_k = k + lane_id_mod_NxN_div_N; // Load N blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; + int uniform_a_col_is_valid = my_s_a_col_is_valid[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); int uniform_a_col_is_valid = utils::shfl( a_col_is_valid, shfl_offset + my_k ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 ) @@ -1084,7 +1161,12 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, is_leader = lane_id_mod_NxN_mod_N == 0; } +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors( my_bmAx, is_leader ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors( s_mem, my_bmAx, is_leader ); +#endif // Update the shared terms. if ( ROW_MAJOR ) @@ -1113,7 +1195,12 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors( my_bmAx, is_leader ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors( s_mem, my_bmAx, is_leader ); +#endif // Store the results. if ( ROW_MAJOR ) @@ -1135,10 +1222,10 @@ void DILU_forward_NxN_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int N, int CTA_SIZE, int WARP_SIZE, bool HAS_EXTERNAL_DIAG, int NUM_WARP_ITERS_PER_BLOCK > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_NxN_kernel_large( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1382,10 +1469,10 @@ void DILU_forward_NxN_kernel_large( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_4x4_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1417,8 +1504,17 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, // Useful index to compute matrix products. const int lane_id_mod_16_div_4 = lane_id_mod_16 / 4; const int lane_id_mod_16_mod_4 = lane_id_mod_16 % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + __shared__ volatile int s_a_col_is_valid[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[threadIdx.x - lane_id_mod_16]; + volatile int *my_s_a_col_is_valid = &s_a_col_is_valid[threadIdx.x - lane_id_mod_16]; +#else // We to get my data from when I use SHFL. const int shfl_offset = lane_id - lane_id_mod_16; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. 
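// The DILU kernels in the surrounding hunks repeatedly broadcast a per-lane
// column ID to every lane of the 16-wide sub-group that works on one block row:
// through a shuffle on SM30+, or through a per-group shared slice otherwise.
// A minimal sketch, assuming the group lies inside one warp; both helper names
// are hypothetical, not from the patch:
__device__ __forceinline__ int broadcast_in_group( int a_col_id, int shfl_offset, int k )
{
    // SM30+ : read lane (shfl_offset + k) of the warp directly.
    return __shfl_sync( 0xffffffffu, a_col_id, shfl_offset + k );
}

__device__ __forceinline__ int broadcast_in_group_smem( volatile int *my_s_a_col_ids, int a_col_id, int lane_in_group, int k )   // hypothetical pre-SM30 variant
{
    // Pre-SM30 : stage the ID in the group's shared slice, then read slot k.
    my_s_a_col_ids[lane_in_group] = a_col_id;
    return my_s_a_col_ids[k];
}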
@@ -1500,6 +1596,10 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, a_col_is_valid = row_colors[a_col_id] < current_color; } +#endif +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[lane_id_mod_16] = a_col_id; + my_s_a_col_is_valid[lane_id_mod_16] = a_col_is_valid; #endif // Count the number of active columns. // int vote = utils::ballot(aColId != -1); @@ -1511,8 +1611,13 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, { int my_k = k + lane_id_mod_16_div_4; // Load N blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; + int uniform_a_col_is_valid = my_s_a_col_is_valid[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); int uniform_a_col_is_valid = utils::shfl( a_col_is_valid, shfl_offset + my_k ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 ) @@ -1572,7 +1677,12 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, is_leader = lane_id_mod_16_mod_4 == 0; } +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors<4, ROW_MAJOR, WARP_SIZE>( my_bmAx, is_leader ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors<4, ROW_MAJOR, WARP_SIZE>( s_mem, my_bmAx, is_leader ); +#endif // Update the shared terms. if ( ROW_MAJOR ) @@ -1601,7 +1711,12 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx = reduce_distributed_vectors<4, ROW_MAJOR, WARP_SIZE>( my_bmAx, is_leader ); +#else + s_mem[threadIdx.x] = my_bmAx; + my_bmAx = reduce_distributed_vectors<4, ROW_MAJOR, WARP_SIZE>( s_mem, my_bmAx, is_leader ); +#endif // Store the results. if ( ROW_MAJOR ) @@ -1625,10 +1740,10 @@ void DILU_forward_4x4_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int CTA_SIZE, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1656,7 +1771,16 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Useful constants. const int thread_id_mod_16_div_4 = thread_id_mod_16 / 4; const int thread_id_mod_16_mod_4 = thread_id_mod_16 % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + __shared__ volatile int s_a_col_is_valid[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[16 * thread_id_div_16]; + volatile int *my_s_a_col_is_valid = &s_a_col_is_valid[16 * thread_id_div_16]; +#else const int shfl_offset = 16 * (lane_id / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -1727,6 +1851,10 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, a_col_is_valid = row_colors[a_col_id] < current_color; } +#endif +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[thread_id_mod_16] = a_col_id; + my_s_a_col_is_valid[thread_id_mod_16] = a_col_is_valid; #endif // Loop over columns. We compute 8 columns per iteration. 
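// The kernels in this solver all receive the same occupancy-hint change:
// a higher minimum resident-CTA count (typically twelve) is requested on
// sm_35 and newer, and a lower one (eight) on older parts, so the compiler
// limits register use accordingly.  A stand-alone sketch of that pattern;
// the kernel name and body are placeholders, not from the patch:
template< int CTA_SIZE >
__global__
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
__launch_bounds__( CTA_SIZE, 12 )
#elif defined(__CUDA_ARCH__)
__launch_bounds__( CTA_SIZE, 8 )
#endif
void occupancy_tuned_kernel( int *out )
{
    out[blockIdx.x * CTA_SIZE + threadIdx.x] = threadIdx.x;
}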
@@ -1734,8 +1862,13 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, { int my_k = k + thread_id_mod_16_div_4; // Load 8 blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; + int uniform_a_col_is_valid = my_s_a_col_is_valid[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); int uniform_a_col_is_valid = utils::shfl( a_col_is_valid, shfl_offset + my_k ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 ) @@ -1778,8 +1911,17 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Load Einvs. Matrix_type my_Einv = Einv[16 * a_row_id + thread_id_mod_16]; // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx += utils::shfl_xor( my_bmAx, 4 ); my_bmAx += utils::shfl_xor( my_bmAx, 8 ); +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( lane_id < 28 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 4]; } + + if ( lane_id < 24 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 8]; } + +#endif // Update the shared terms. if ( thread_id_mod_16_div_4 == 0 ) @@ -1790,8 +1932,17 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Update the diagonal term. my_bmAx = my_Einv * my_s_mem[thread_id_mod_16_mod_4]; // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_bmAx += utils::shfl_xor( my_bmAx, 1 ); my_bmAx += utils::shfl_xor( my_bmAx, 2 ); +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( lane_id < 31 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 1]; } + + if ( lane_id < 30 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 2]; } + +#endif // Store the results. if ( thread_id_mod_16_mod_4 == 0 ) @@ -1805,10 +1956,10 @@ void DILU_forward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, int NUM_THREADS_PER_ROW, int CTA_SIZE, int WARP_SIZE, bool HAS_EXTERNAL_DIAG > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_forward_1x1_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1834,6 +1985,10 @@ void DILU_forward_1x1_kernel( const int *__restrict A_rows, const int lane_id = utils::lane_id(); // Constants. const int lane_id_mod_NTPR = lane_id % NUM_THREADS_PER_ROW; +#if __CUDA_ARCH__ < 300 + // Shared memory needed to exchange X and delta. + __shared__ volatile Vector_type s_mem[CTA_SIZE]; +#endif // Determine which NxN block the threads work with. int a_row_it = blockIdx.x * NUM_ROWS_PER_CTA + (threadIdx.x / NUM_THREADS_PER_ROW); @@ -1932,6 +2087,7 @@ void DILU_forward_1x1_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 #pragma unroll for ( int mask = NUM_THREADS_PER_ROW / 2 ; mask > 0 ; mask >>= 1 ) @@ -1939,6 +2095,18 @@ void DILU_forward_1x1_kernel( const int *__restrict A_rows, my_bmAx += utils::shfl_xor( my_bmAx, mask ); } +#else + s_mem[threadIdx.x] = my_bmAx; +#pragma unroll + + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id_mod_NTPR < offset ) + { + s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + offset]; + } + +#endif + // Store the results. 
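// The 1x1 kernel above reduces one partial sum per matrix row across
// NUM_THREADS_PER_ROW lanes: a log2(N) xor-shuffle loop on sm_30+, or the
// strided shared-memory loop restored just above for Fermi. A standalone
// sketch of the shuffle variant follows; the name sketch_row_sum(), the float
// type and the full-warp mask are assumptions, and NTPR must be a power of two
// no larger than the warp size.
template< int NTPR >
__device__ __forceinline__ float sketch_row_sum( float v )
{
#pragma unroll
    for ( int mask = NTPR / 2 ; mask > 0 ; mask >>= 1 )
    {
        // An xor stride smaller than NTPR keeps the exchange inside each
        // aligned group of NTPR lanes, so several rows can share one warp.
        v += __shfl_xor_sync( 0xffffffffu, v, mask );
    }

    return v; // every lane of the group ends up holding the group's sum
}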
if ( lane_id_mod_NTPR == 0 ) { @@ -1951,10 +2119,10 @@ void DILU_forward_1x1_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, typename WeightType, int N, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_NxN_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -1989,8 +2157,15 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, // Useful index to compute matrix products. const int lane_id_mod_NxN_div_N = lane_id_mod_NxN / N; const int lane_id_mod_NxN_mod_N = lane_id_mod_NxN % N; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[threadIdx.x - lane_id_mod_NxN]; +#else // We to get my data from when I use SHFL. const int shfl_offset = lane_id - lane_id_mod_NxN; +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -2068,6 +2243,10 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, a_col_id = a_col_tmp; } +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[lane_id_mod_NxN] = a_col_id; +#endif + // Count the number of active columns. // int vote = utils::ballot(aColId != -1); // The number of iterations. @@ -2078,7 +2257,11 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, { int my_k = k + lane_id_mod_NxN_div_N; // Load N blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); +#endif Vector_type my_x(0); if ( uniform_a_col_id != -1 ) @@ -2130,7 +2313,12 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, is_leader = lane_id_mod_NxN_mod_N == 0; } +#if __CUDA_ARCH__ >= 300 my_delta = reduce_distributed_vectors( my_delta, is_leader ); +#else + s_mem[threadIdx.x] = my_delta; + my_delta = reduce_distributed_vectors( s_mem, my_delta, is_leader ); +#endif // Update the shared terms. if ( ROW_MAJOR ) @@ -2159,7 +2347,12 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 my_delta = reduce_distributed_vectors( my_delta, is_leader ); +#else + s_mem[threadIdx.x] = my_delta; + my_delta = reduce_distributed_vectors( s_mem, my_delta, is_leader ); +#endif // Store the results. 
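// At the call sites above, the restored Fermi branch first spills the per-lane
// value into the volatile s_mem buffer and then calls an overload of
// reduce_distributed_vectors that takes that buffer, while the sm_30+ branch
// calls a register-only overload. The sketch below only shows the shape of
// that dual code path with a hypothetical pair-wise helper, sketch_pair_sum();
// it is not the real reduce_distributed_vectors, which is templated on the
// block size and layout and reduces N-wide groups.
// Register/shuffle overload (sm_30+): no scratch memory needed.
__device__ __forceinline__ double sketch_pair_sum( double v )
{
    return v + __shfl_xor_sync( 0xffffffffu, v, 1 );
}

// Shared-memory overload (Fermi): the caller passes the CTA-wide scratch
// buffer, already loaded with this thread's value; only the even lane of each
// pair ends up with the full sum, which is all the leader-lane logic needs.
__device__ __forceinline__ double sketch_pair_sum( volatile double *s_scratch, double v )
{
    const int lane_id = threadIdx.x % 32;

    if ( ( lane_id & 1 ) == 0 ) { s_scratch[threadIdx.x] = v += s_scratch[threadIdx.x + 1]; }

    return v;
}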
if ( ROW_MAJOR ) @@ -2209,10 +2402,10 @@ void DILU_backward_NxN_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, typename WeightType, int N, int CTA_SIZE, int WARP_SIZE, bool ROW_MAJOR, int NUM_WARP_ITERS_PER_BLOCK > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_NxN_kernel_large( const int *__restrict A_rows, const int *__restrict A_cols, @@ -2451,10 +2644,10 @@ void DILU_backward_NxN_kernel_large( const int *__restrict A_rows, template< typename IndexType, typename ValueTypeA, typename ValueTypeB, typename WeightType, int CTA_SIZE, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_4x4_kernel( const IndexType *row_offsets, const IndexType *column_indices, @@ -2477,7 +2670,14 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, const int halfLaneId = threadIdx.x % 16; const int halfLaneId_div_4 = halfLaneId / 4; const int halfLaneId_mod_4 = halfLaneId % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_aColIds[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_aColIds = &s_aColIds[16 * halfWarpId]; +#else const int upperHalf = 16 * (laneId / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile ValueTypeB s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -2520,12 +2720,19 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, aColId = aColTmp; } +#endif +#if __CUDA_ARCH__ < 300 + my_s_aColIds[halfLaneId] = aColId; #endif for ( int k = 0 ; k < 16 ; k += 4 ) { int my_k = k + halfLaneId_div_4; // Exchange column indices. +#if __CUDA_ARCH__ < 300 + int waColId = my_s_aColIds[my_k]; +#else int waColId = utils::shfl( aColId, upperHalf + my_k ); +#endif // Load 8 blocks of X if needed. ValueTypeB my_x(0); @@ -2542,7 +2749,12 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, { const int k_i = k + i; int w_aColTmp = aColBegin + k_i, w_aColIt = -1; +#if __CUDA_ARCH__ < 300 + + if ( my_s_aColIds[k_i] != -1 && w_aColTmp < aColEnd ) +#else if ( utils::shfl( aColId, upperHalf + k_i ) != -1 && w_aColTmp < aColEnd ) +#endif w_aColIt = w_aColTmp; ValueTypeA my_val(0); @@ -2567,6 +2779,8 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, // Load EINV values. ValueTypeA my_Einv = Einv[16 * aRowId + halfLaneId]; // Reduce delta terms. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_delta += utils::shfl_xor( my_delta, 1 ); @@ -2578,6 +2792,24 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, my_delta += utils::shfl_xor( my_delta, 8 ); } +#else + s_mem[threadIdx.x] = my_delta; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 8]; } + } + +#endif + // Update the shared terms. 
if ( ROW_MAJOR ) { @@ -2605,6 +2837,8 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, } // Regroup results. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_delta += utils::shfl_xor( my_delta, 1 ); @@ -2616,6 +2850,24 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, my_delta += utils::shfl_xor( my_delta, 8 ); } +#else + s_mem[threadIdx.x] = my_delta; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 8]; } + } + +#endif + // Store the results. if ( ROW_MAJOR ) { @@ -2662,10 +2914,10 @@ void DILU_backward_4x4_kernel( const IndexType *row_offsets, template< typename Matrix_type, typename Vector_type, typename WeightType, int CTA_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, const int *__restrict A_cols, @@ -2693,7 +2945,14 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Useful constants. const int thread_id_mod_16_div_4 = thread_id_mod_16 / 4; const int thread_id_mod_16_mod_4 = thread_id_mod_16 % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_a_col_ids[CTA_SIZE]; + // Each thread keeps its own pointer. + volatile int *my_s_a_col_ids = &s_a_col_ids[16 * thread_id_div_16]; +#else const int shfl_offset = 16 * (lane_id / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile Vector_type s_mem[CTA_SIZE]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -2754,6 +3013,9 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, a_col_id = -1; } +#if __CUDA_ARCH__ < 300 + my_s_a_col_ids[thread_id_mod_16] = a_col_id; +#endif // Loop over columns. We compute 8 columns per iteration. #pragma unroll 2 @@ -2761,7 +3023,11 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, { int my_k = k + thread_id_mod_16_div_4; // Load 8 blocks of X. +#if __CUDA_ARCH__ < 300 + int uniform_a_col_id = my_s_a_col_ids[my_k]; +#else int uniform_a_col_id = utils::shfl( a_col_id, shfl_offset + my_k ); +#endif Vector_type my_Delta(0); if ( uniform_a_col_id != -1 ) @@ -2794,8 +3060,17 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Load EINV values. Matrix_type my_Einv = Einv[16 * a_row_id + thread_id_mod_16]; // Reduce delta terms. +#if __CUDA_ARCH__ >= 300 my_delta += utils::shfl_xor( my_delta, 4 ); my_delta += utils::shfl_xor( my_delta, 8 ); +#else + s_mem[threadIdx.x] = my_delta; + + if ( lane_id < 28 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 4]; } + + if ( lane_id < 24 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 8]; } + +#endif // Update the shared terms. if ( thread_id_mod_16_div_4 == 0 ) @@ -2806,8 +3081,17 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, // Update the diagonal term. my_delta = my_Einv * my_s_mem[thread_id_mod_16_mod_4]; // Regroup results. 
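// The "reduce"/"regroup" steps in these 4x4 kernels sum four partial products
// spread over a 16-lane half-warp, and the xor strides encode the block
// layout: row-major keeps the four partners in consecutive lanes (strides 1
// and 2), column-major keeps them four lanes apart (strides 4 and 8). The
// restored Fermi branches do the same strided adds through volatile shared
// memory behind "laneId < 31/30" or "laneId < 28/24" guards. A standalone
// sketch of the shuffle variant follows; the name sketch_regroup4() and the
// use of double for the templated value types are assumptions.
template< bool ROW_MAJOR >
__device__ __forceinline__ double sketch_regroup4( double v )
{
    // Two xor-butterfly steps; afterwards every lane of the four-lane partner
    // set holds the complete sum. The strides fold at compile time.
    const int stride_a = ROW_MAJOR ? 1 : 4;
    const int stride_b = ROW_MAJOR ? 2 : 8;
    v += __shfl_xor_sync( 0xffffffffu, v, stride_a );
    v += __shfl_xor_sync( 0xffffffffu, v, stride_b );
    return v;
}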
+#if __CUDA_ARCH__ >= 300 my_delta += utils::shfl_xor( my_delta, 1 ); my_delta += utils::shfl_xor( my_delta, 2 ); +#else + s_mem[threadIdx.x] = my_delta; + + if ( lane_id < 31 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 1]; } + + if ( lane_id < 30 ) { s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + 2]; } + +#endif // Store the results. int offset = 4 * a_row_id + thread_id_mod_16_div_4; Vector_type my_b(0), my_x(0); @@ -2832,10 +3116,10 @@ void DILU_backward_4x4_kernel_row_major_vec4( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, typename WeightType, int NUM_THREADS_PER_ROW, int CTA_SIZE, int WARP_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 12 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_1x1_kernel( const int *__restrict A_rows, const int *__restrict A_cols, @@ -2861,6 +3145,10 @@ void DILU_backward_1x1_kernel( const int *__restrict A_rows, const int lane_id = utils::lane_id(); // Constants. const int lane_id_mod_NTPR = lane_id % NUM_THREADS_PER_ROW; +#if __CUDA_ARCH__ < 300 + // Shared memory needed to exchange X and delta. + __shared__ volatile Vector_type s_mem[CTA_SIZE]; +#endif // Determine which NxN block the threads work with. int a_row_it = blockIdx.x * NUM_ROWS_PER_CTA + (threadIdx.x / NUM_THREADS_PER_ROW); @@ -2928,6 +3216,7 @@ void DILU_backward_1x1_kernel( const int *__restrict A_rows, } // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 #pragma unroll for ( int mask = NUM_THREADS_PER_ROW / 2 ; mask > 0 ; mask >>= 1 ) @@ -2935,6 +3224,18 @@ void DILU_backward_1x1_kernel( const int *__restrict A_rows, my_delta += utils::shfl_xor( my_delta, mask ); } +#else + s_mem[threadIdx.x] = my_delta; +#pragma unroll + + for ( int offset = NUM_THREADS_PER_ROW / 2 ; offset > 0 ; offset >>= 1 ) + if ( lane_id_mod_NTPR < offset ) + { + s_mem[threadIdx.x] = my_delta += s_mem[threadIdx.x + offset]; + } + +#endif + // Store the results. 
if ( lane_id_mod_NTPR == 0 ) { @@ -2949,10 +3250,10 @@ void DILU_backward_1x1_kernel( const int *__restrict A_rows, template< typename Matrix_type, typename Vector_type, typename WeightType, int N, int CTA_SIZE > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CTA_SIZE, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CTA_SIZE, 16 ) +__launch_bounds__( CTA_SIZE, 8 ) #endif void DILU_backward_NxN_kernel_skip( Vector_type *__restrict x, const WeightType weight, diff --git a/core/src/solvers/multicolor_ilu_solver.cu b/core/src/solvers/multicolor_ilu_solver.cu index 35f00a68..a13e6f62 100644 --- a/core/src/solvers/multicolor_ilu_solver.cu +++ b/core/src/solvers/multicolor_ilu_solver.cu @@ -69,10 +69,10 @@ namespace multicolor_ilu_solver template __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CtaSize, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CtaSize, 16 ) +__launch_bounds__( CtaSize, 8 ) #endif void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, const IndexType *LU_smaller_color_offsets, @@ -97,7 +97,14 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, const int halfLaneId = threadIdx.x % 16; const int halfLaneId_div_4 = halfLaneId / 4; const int halfLaneId_mod_4 = halfLaneId % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_aColIds[CtaSize]; + // Each thread keeps its own pointer. + volatile int *my_s_aColIds = &s_aColIds[16 * halfWarpId]; +#else const int upperHalf = 16 * (laneId / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile ValueTypeB s_mem[CtaSize]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -156,6 +163,9 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, aColId = aRowId; } +#if __CUDA_ARCH__ < 300 + my_s_aColIds[halfLaneId] = aColId; +#endif // Count the number of active columns. int vote = utils::ballot(aColId != -1, active_mask); // The number of iterations. @@ -166,7 +176,11 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, { int my_k = k + halfLaneId_div_4; // Load 8 blocks of X. +#if __CUDA_ARCH__ < 300 + int waColId = my_s_aColIds[my_k]; +#else int waColId = utils::shfl( aColId, upperHalf + my_k, warpSize, active_mask ); +#endif ValueTypeB my_x(0); if ( waColId != -1 ) @@ -229,6 +243,10 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, aColId = LU_column_indices[aColIt]; } +#if __CUDA_ARCH__ < 300 + my_s_aColIds[halfLaneId] = aColId; +#endif + // Count the number of active columns. int vote = utils::ballot(aColId != -1, active_mask); // The number of iterations. @@ -238,7 +256,11 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, { int my_k = k + halfLaneId_div_4; // Load 8 blocks of X. +#if __CUDA_ARCH__ < 300 + int waColId = my_s_aColIds[my_k]; +#else int waColId = utils::shfl( aColId, upperHalf + my_k, warpSize, active_mask ); +#endif ValueTypeB my_delta(0); if ( waColId != -1 ) @@ -280,6 +302,8 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, } // If current_color != 0 // Reduce bmAx terms. 
+#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_bmAx += utils::shfl_xor( my_bmAx, 1, warpSize, active_mask ); @@ -291,6 +315,24 @@ void LU_forward_4x4_kernel_warp( const IndexType *LU_row_offsets, my_bmAx += utils::shfl_xor( my_bmAx, 8, warpSize, active_mask ); } +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 8]; } + } + +#endif + // Store the results. if ( ROW_MAJOR ) { @@ -427,10 +469,10 @@ void LU_forward_4x4_kernel(const IndexType *LU_row_offsets, const IndexType *LU_ template< typename IndexType, typename ValueTypeA, typename ValueTypeB, int CtaSize, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CtaSize, 16 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CtaSize, 16 ) +__launch_bounds__( CtaSize, 8 ) #endif void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, const IndexType *larger_color_offsets, @@ -454,7 +496,14 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, const int halfLaneId = threadIdx.x % 16; const int halfLaneId_div_4 = halfLaneId / 4; const int halfLaneId_mod_4 = halfLaneId % 4; +#if __CUDA_ARCH__ < 300 + // Shared memory to broadcast column IDs. + __shared__ volatile int s_aColIds[CtaSize]; + // Each thread keeps its own pointer. + volatile int *my_s_aColIds = &s_aColIds[16 * halfWarpId]; +#else const int upperHalf = 16 * (laneId / 16); +#endif // Shared memory needed to exchange X and delta. __shared__ volatile ValueTypeB s_mem[CtaSize]; // Each thread keeps its own pointer to shared memory to avoid some extra computations. @@ -501,12 +550,20 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, aColId = column_indices[aColIt]; } +#if __CUDA_ARCH__ < 300 + my_s_aColIds[halfLaneId] = aColId; +#endif + // Loop over columns. We compute 8 columns per iteration. for ( int k = 0 ; k < 16 ; k += 4 ) { int my_k = k + halfLaneId_div_4; // Exchange column indices. +#if __CUDA_ARCH__ < 300 + int waColId = my_s_aColIds[my_k]; +#else int waColId = utils::shfl( aColId, upperHalf + my_k, warpSize, active_mask ); +#endif // Load 8 blocks of X if needed. ValueTypeB *my_ptr = Delta; @@ -555,6 +612,8 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, } // Loop over aColIt // Reduce bmAx terms. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_bmAx += utils::shfl_xor( my_bmAx, 1, warpSize, active_mask ); @@ -565,6 +624,24 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, my_bmAx += utils::shfl_xor( my_bmAx, 4, warpSize, active_mask ); my_bmAx += utils::shfl_xor( my_bmAx, 8, warpSize, active_mask ); } + +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 8]; } + } + +#endif } // if current_color != num_colors-1 // Update the shared terms. 
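// The warp-per-row ILU kernels above pass an explicit active_mask and the warp
// size to utils::ballot and utils::shfl_xor, i.e. they use the CUDA 9 style
// *_sync primitives so partially active warps stay well defined. A standalone
// sketch of the "ballot the valid columns, derive the loop bound from the
// vote" idiom follows; the function and variable names are illustrative only.
__device__ __forceinline__ int sketch_count_valid_columns( int a_col_id )
{
    // Capture which lanes are still executing this path, then vote on which of
    // them hold a valid (non -1) column index.
    const unsigned int active_mask = __activemask();
    const unsigned int vote        = __ballot_sync( active_mask, a_col_id != -1 );
    // The popcount bounds the columns-per-iteration loop in the kernels above;
    // lanes whose bit is unset simply contribute nothing.
    return __popc( vote );
}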
@@ -603,6 +680,8 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, } // Regroup results. +#if __CUDA_ARCH__ >= 300 + if ( ROW_MAJOR ) { my_bmAx += utils::shfl_xor( my_bmAx, 1 ); @@ -614,6 +693,24 @@ void LU_backward_4x4_kernel_warp( const IndexType *row_offsets, my_bmAx += utils::shfl_xor( my_bmAx, 8 ); } +#else + s_mem[threadIdx.x] = my_bmAx; + + if ( ROW_MAJOR ) + { + if ( laneId < 31 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 1]; } + + if ( laneId < 30 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 2]; } + } + else + { + if ( laneId < 28 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 4]; } + + if ( laneId < 24 ) { s_mem[threadIdx.x] = my_bmAx += s_mem[threadIdx.x + 8]; } + } + +#endif + // Store the results. if ( ROW_MAJOR ) { @@ -989,10 +1086,10 @@ computeAtoLUmappingExtDiag_kernel( int A_nRows, template< typename ValueTypeA, int CtaSize, int SMemSize, bool ROW_MAJOR > __global__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 __launch_bounds__( CtaSize, 12 ) #elif defined(__CUDA_ARCH__) -__launch_bounds__( CtaSize, 12 ) +__launch_bounds__( CtaSize, 8 ) #endif void compute_LU_factors_4x4_kernel_warp( int A_nRows, @@ -1021,7 +1118,12 @@ compute_LU_factors_4x4_kernel_warp( int A_nRows, __shared__ volatile ValueTypeA s_C_mtx[nWarps][32]; __shared__ volatile ValueTypeA s_F_mtx[nWarps][16]; // Shared memory to store the proposed column to load +#if __CUDA_ARCH__ < 300 + __shared__ volatile int s_aColItToLoad [nWarps][32]; + __shared__ volatile int s_waColItToLoad[nWarps][32]; +#else __shared__ volatile int s_aColSrc[nWarps][32]; +#endif // Shared memory to store the column indices of the current row __shared__ volatile int s_keys[nWarps][SMemSize]; @@ -1143,7 +1245,12 @@ compute_LU_factors_4x4_kernel_warp( int A_nRows, if (pred) { +#if __CUDA_ARCH__ < 300 + s_aColItToLoad [warpId][idst] = found_aColIt; + s_waColItToLoad[warpId][idst] = waColIt; +#else s_aColSrc[warpId][idst] = laneId; +#endif } utils::syncwarp(active_mask); @@ -1156,6 +1263,13 @@ compute_LU_factors_4x4_kernel_warp( int A_nRows, // Where to get columns from. int a_col_it = -1, w_col_it = -1; // Load column to load +#if __CUDA_ARCH__ < 300 + if ( my_k < n_cols ) + { + a_col_it = s_aColItToLoad [warpId][my_k]; + w_col_it = s_waColItToLoad[warpId][my_k]; + } +#else a_col_it = utils::shfl(found_aColIt, s_aColSrc[warpId][my_k], warpSize, active_mask); w_col_it = utils::shfl(waColIt, s_aColSrc[warpId][my_k], warpSize, active_mask); @@ -1165,6 +1279,7 @@ compute_LU_factors_4x4_kernel_warp( int A_nRows, w_col_it = -1; } +#endif ValueTypeA my_C(0); if ( w_col_it != -1 ) diff --git a/examples/amgx_spmv_example/Makefile b/examples/amgx_spmv_example/Makefile index db08516e..cb0ae6da 100644 --- a/examples/amgx_spmv_example/Makefile +++ b/examples/amgx_spmv_example/Makefile @@ -16,7 +16,7 @@ EXTRA_LIBS = -lcublas -lcusparse -lcusolver -Xlinker=-rpath=$(CUDA_PATH)/lib64 AMGX_ROOT = ../.. 
AMGX_INCLUDE = -I$(AMGX_ROOT)/../../thrust -I$(AMGX_ROOT)/base/include -I$(CUDA_PATH)/include -I$(AMGX_ROOT)/external/rapidjson/include -I$(MPI_PATH)/include -BASE_CU_FILES := ../../base/src/misc.cu ../../base/src/device_properties.cu ../../base/src/logger.cu ../../base/src/auxdata.cu ../../base/src/amgx_cusparse.cu ../../base/src/amgx_cublas.cu ../../base/src/amg_config.cu ../../base/src/global_thread_handle.cu ../../base/src/error.cu ../../base/src/thread_manager.cu ../../base/src/resources.cu ../../base/src/matrix.cu ../../base/src/multiply.cu ../../base/src/hash_workspace.cu ../../base/src/csr_multiply.cu ../../base/src/csr_multiply_sm70.cu ../../base/src/csr_multiply_sm35.cu ../../base/src/matrix_coloring/matrix_coloring.cu ../../base/src/distributed/distributed_manager.cu ../../base/src/distributed/distributed_arranger.cu ../../base/src/distributed/comms_visitors1.cu ../../base/src/distributed/comms_visitors2.cu ../../base/src/distributed/comms_visitors3.cu ../../base/src/distributed/comms_visitors4.cu ../../base/src/distributed/comms_mpi_hostbuffer_stream.cu ../../base/src/distributed/comms_mpi_gpudirect.cu ../../base/src/distributed/distributed_comms.cu +BASE_CU_FILES := ../../base/src/misc.cu ../../base/src/device_properties.cu ../../base/src/logger.cu ../../base/src/auxdata.cu ../../base/src/amgx_cusparse.cu ../../base/src/amgx_cublas.cu ../../base/src/amg_config.cu ../../base/src/global_thread_handle.cu ../../base/src/error.cu ../../base/src/thread_manager.cu ../../base/src/resources.cu ../../base/src/matrix.cu ../../base/src/multiply.cu ../../base/src/hash_workspace.cu ../../base/src/csr_multiply.cu ../../base/src/csr_multiply_sm20.cu ../../base/src/csr_multiply_sm35.cu ../../base/src/matrix_coloring/matrix_coloring.cu ../../base/src/distributed/distributed_manager.cu ../../base/src/distributed/distributed_arranger.cu ../../base/src/distributed/comms_visitors1.cu ../../base/src/distributed/comms_visitors2.cu ../../base/src/distributed/comms_visitors3.cu ../../base/src/distributed/comms_visitors4.cu ../../base/src/distributed/comms_mpi_hostbuffer_stream.cu ../../base/src/distributed/comms_mpi_gpudirect.cu ../../base/src/distributed/distributed_comms.cu OBJDIR_S_GPU := obj_s OBJDIR_M_GPU := obj_m
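// In the compute_LU_factors_4x4_kernel_warp hunk above, the restored Fermi
// branch stages the compacted column iterators themselves in shared memory
// (s_aColItToLoad / s_waColItToLoad), whereas the sm_30+ branch stores only the
// source lane (s_aColSrc) and later fetches the values with a shuffle whose
// source lane is data dependent. A standalone sketch of that shuffle-as-gather
// idiom follows; the name sketch_warp_gather() and the full-warp mask are
// assumptions, not AMGX code.
__device__ __forceinline__ int sketch_warp_gather( int my_value, int src_lane, bool valid )
{
    // Every lane reads my_value from the lane selected by src_lane; slots
    // flagged invalid are mapped to -1, matching the "-1 means no column"
    // convention used throughout these kernels.
    const int fetched = __shfl_sync( 0xffffffffu, my_value, src_lane );

    return valid ? fetched : -1;
}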