Skip to content

Commit

Permalink
[cudadev] Moved accesses from value to const ref so that we get the b…
Browse files Browse the repository at this point in the history
…enefit of __restrict__

Created an example to easily validate the effect __restrict__

The result can be checked compiling with the `-ptx` option instead of `-c`, and then grepped with:.

```
$ cat obj/cudadev/test/SoAStoreAndView_t.cu.ptx | c++filt | egrep '(.visible|(ld|st).global)' --color
.visible .entry aAMDef(SoA1ViewTemplate<128ul, (cms::soa::AlignmentEnforcement)0, (cms::soa::CacheAccessStyle)0, (cms::soa::RestrictQualify)1>, unsigned long)(
        ld.global.f64   %fd1, [%rd21];
        ld.global.f64   %fd2, [%rd20];
        st.global.f64   [%rd22], %fd3;
        ld.global.f64   %fd4, [%rd21];
        ld.global.f64   %fd5, [%rd20];
        st.global.f64   [%rd23], %fd6;
.visible .entry aAMRestrict(SoA1ViewTemplate<128ul, (cms::soa::AlignmentEnforcement)0, (cms::soa::CacheAccessStyle)0, (cms::soa::RestrictQualify)0>, unsigned long)(
        ld.global.nc.f64        %fd1, [%rd21];
        ld.global.nc.f64        %fd2, [%rd20];
        st.global.f64   [%rd22], %fd3;
        st.global.f64   [%rd23], %fd4;
.visible .entry aAMNC(SoA1ViewTemplate<128ul, (cms::soa::AlignmentEnforcement)0, (cms::soa::CacheAccessStyle)1, (cms::soa::RestrictQualify)1>, unsigned long)(
        ld.global.f64   %fd1, [%rd21];
        ld.global.f64   %fd2, [%rd20];
        st.global.f64   [%rd22], %fd3;
        ld.global.f64   %fd4, [%rd21];
        ld.global.f64   %fd5, [%rd20];
        st.global.f64   [%rd23], %fd6;
.visible .entry aAMRestrict(SoA1ViewTemplate<128ul, (cms::soa::AlignmentEnforcement)0, (cms::soa::CacheAccessStyle)1, (cms::soa::RestrictQualify)0>, unsigned long)(
        ld.global.nc.f64        %fd1, [%rd21];
        ld.global.nc.f64        %fd2, [%rd20];
        st.global.f64   [%rd22], %fd3;
        st.global.f64   [%rd23], %fd4;
```

The hint from restrict qualifier is used by the compiler to load values from
global memory only once and via the non-coherent cache. The cache access styles
are not implemented, and hence have no effect.
  • Loading branch information
ericcano committed Jan 14, 2022
1 parent a2fad0a commit 87d084a
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 16 deletions.
34 changes: 24 additions & 10 deletions src/cudadev/DataFormats/SoACommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@

#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
// Read a pointer content via read-only (non coherent) cache.
#define LOAD_INCOHERENT(A) __ldg(A)
#define LOAD_STREAMED(A) __ldcs(A)
#define STORE_STREAMED(A, V) __stcs(A, V)
#define LOAD_NONCOHERENT(A) __ldg(A)
#define LOAD_STREAMING(A) __ldcs(A)
#define STORE_STREAMING(A, V) __stcs(A, V)
#else
#define LOAD_INCOHERENT(A) *(A)
#define LOAD_STREAMED(A) *(A)
#define STORE_STREAMED(A, V) *(A) = (V)
#define LOAD_NONCOHERENT(A) *(A)
#define LOAD_STREAMING(A) *(A)
#define STORE_STREAMING(A, V) *(A) = (V)
#endif

// compile-time sized SoA
Expand Down Expand Up @@ -70,7 +70,7 @@ SOA_HOST_DEVICE_INLINE T readWithCacheStyle (const T * addr) {
if constexpr (CACHE_ACCESS_STYLE == CacheAccessStyle::NonCoherent) {
return LOAD_INCOHERENT(addr);
} else if constexpr (CACHE_ACCESS_STYLE == CacheAccessStyle::Streaming) {
return LOAD_STREAMED(addr);
return LOAD_STREAMING(addr);
}
return *addr;
}
Expand All @@ -88,10 +88,19 @@ class SoAValue {
typedef typename Restr::Pointer Ptr;
typedef typename Restr::Reference Ref;
typedef typename Restr::PointerToConst PtrToConst;
typedef typename Restr::ReferenceToConst RefToConst;
SOA_HOST_DEVICE_INLINE SoAValue(size_t i, T* col) : idx_(i), col_(col) {}
/* SOA_HOST_DEVICE_INLINE operator T&() { return col_[idx_]; } */
SOA_HOST_DEVICE_INLINE Ref operator()() { return alignedCol()[idx_]; }
SOA_HOST_DEVICE_INLINE Val operator()() const { return *(alignedCol() + idx_); }
SOA_HOST_DEVICE_INLINE Ref operator()() {
// Ptr type will add the restrict qualifyer if needed
Ptr col = alignedCol();
return col[idx_];
}
SOA_HOST_DEVICE_INLINE RefToConst operator()() const {
// PtrToConst type will add the restrict qualifyer if needed
PtrToConst col = alignedCol();
return col[idx_];
}
SOA_HOST_DEVICE_INLINE Ptr operator&() { return &alignedCol()[idx_]; }
SOA_HOST_DEVICE_INLINE PtrToConst operator&() const { return &alignedCol()[idx_]; }
template <typename T2>
Expand Down Expand Up @@ -123,9 +132,14 @@ class SoAConstValue {
typedef typename Restr::Pointer Ptr;
typedef typename Restr::Reference Ref;
typedef typename Restr::PointerToConst PtrToConst;
typedef typename Restr::ReferenceToConst RefToConst;
SOA_HOST_DEVICE_INLINE SoAConstValue(size_t i, const T* col) : idx_(i), col_(col) {}
/* SOA_HOST_DEVICE_INLINE operator T&() { return col_[idx_]; } */
SOA_HOST_DEVICE_INLINE T operator()() const { return *(alignedCol() + idx_); }
SOA_HOST_DEVICE_INLINE RefToConst operator()() const {
// Ptr type will add the restrict qualifyer if needed
PtrToConst col = alignedCol();
return col[idx_];
}
SOA_HOST_DEVICE_INLINE const T* operator&() const { return &alignedCol()[idx_]; }
typedef T valueType;
static constexpr auto valueSize = sizeof(T);
Expand Down
20 changes: 14 additions & 6 deletions src/cudadev/DataFormats/SoAView.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ struct ConstValueTraits<C, SoAColumnType::eigen> {
* Declaration of the members accessors of the const element subclass
*/
#define _DECLARE_VIEW_CONST_ELEMENT_ACCESSOR_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) \
SOA_HOST_DEVICE_INLINE typename BOOST_PP_CAT(SoAMetadata::TypeOf_, LOCAL_NAME) LOCAL_NAME() const { \
SOA_HOST_DEVICE_INLINE typename SoAConstValueWithConf<typename BOOST_PP_CAT(SoAMetadata::TypeOf_, LOCAL_NAME)>::RefToConst LOCAL_NAME() const { \
return BOOST_PP_CAT(LOCAL_NAME, _)(); \
}

Expand Down Expand Up @@ -286,9 +286,13 @@ struct ConstValueTraits<C, SoAColumnType::eigen> {
#define _DECLARE_VIEW_SOA_MEMBER(R, DATA, LAYOUT_MEMBER_NAME) \
BOOST_PP_EXPAND(_DECLARE_VIEW_SOA_MEMBER_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA))

/* ---- MUTABLE VIEW -------------------------------------------------------------------------------------------------------------------- */

#define GENERATE_SOA_VIEW(CLASS, LAYOUTS_LIST, VALUE_LIST) \
template <size_t ALIGNMENT = cms::soa::CacheLineSize::defaultSize, \
cms::soa::AlignmentEnforcement ALIGNMENT_ENFORCEMENT = cms::soa::AlignmentEnforcement::Relaxed> \
cms::soa::AlignmentEnforcement ALIGNMENT_ENFORCEMENT = cms::soa::AlignmentEnforcement::Relaxed, \
cms::soa::CacheAccessStyle CACHE_ACCESS_STYLE = cms::soa::CacheAccessStyle::Default, \
cms::soa::RestrictQualify RESTRICT_QUALIFY = cms::soa::RestrictQualify::Disabled> \
struct CLASS { \
/* these could be moved to an external type trait to free up the symbol names */ \
using self_type = CLASS; \
Expand All @@ -303,12 +307,14 @@ struct ConstValueTraits<C, SoAColumnType::eigen> {
constexpr static AlignmentEnforcement alignmentEnforcement = ALIGNMENT_ENFORCEMENT; \
constexpr static size_t conditionalAlignment = \
alignmentEnforcement == AlignmentEnforcement::Enforced ? byteAlignment : 0; \
/* Those typedefs avoid having commas in macros (which is problematic) */ \
constexpr static cms::soa::CacheAccessStyle cacheAccessStyle = CACHE_ACCESS_STYLE; \
constexpr static cms::soa::RestrictQualify restrictQualify = RESTRICT_QUALIFY; \
/* Those typedefs avoid having commas in macros (which is problematic) */ \
template <class C> \
using SoAValueWithConf = cms::soa::SoAValue<C, conditionalAlignment>; \
using SoAValueWithConf = cms::soa::SoAValue<C, conditionalAlignment, cacheAccessStyle, restrictQualify>; \
\
template <class C> \
using SoAConstValueWithConf = cms::soa::SoAConstValue<C, conditionalAlignment>; \
using SoAConstValueWithConf = cms::soa::SoAConstValue<C, conditionalAlignment, cacheAccessStyle, restrictQualify>; \
\
template <class C> \
using SoAEigenValueWithConf = cms::soa::SoAEigenValue<C, conditionalAlignment>; \
Expand Down Expand Up @@ -395,11 +401,13 @@ struct ConstValueTraits<C, SoAColumnType::eigen> {
_ITERATE_ON_ALL(_DECLARE_VIEW_SOA_MEMBER, BOOST_PP_EMPTY(), VALUE_LIST) \
}

/* ---- CONST VIEW --------------------------------------------------------------------------------------------------------------------- */

#define GENERATE_SOA_CONST_VIEW(CLASS, LAYOUTS_LIST, VALUE_LIST) \
template <size_t ALIGNMENT = cms::soa::CacheLineSize::defaultSize, \
cms::soa::AlignmentEnforcement ALIGNMENT_ENFORCEMENT = cms::soa::AlignmentEnforcement::Relaxed, \
cms::soa::CacheAccessStyle CACHE_ACCESS_STYLE = cms::soa::CacheAccessStyle::NonCoherent, \
cms::soa::RestrictQualify RESTRICT_QUALIFY = cms::soa::RestrictQualify::Enabled> \
cms::soa::RestrictQualify RESTRICT_QUALIFY = cms::soa::RestrictQualify::Enabled> \
struct CLASS { \
/* these could be moved to an external type trait to free up the symbol names */ \
using self_type = CLASS; \
Expand Down
46 changes: 46 additions & 0 deletions src/cudadev/test/SoAStoreAndView_t.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ GENERATE_SOA_LAYOUT(SoA1LayoutTemplate,
SOA_COLUMN(double, x),
SOA_COLUMN(double, y),
SOA_COLUMN(double, z),
SOA_COLUMN(double, sum),
SOA_COLUMN(double, prod),
SOA_EIGEN_COLUMN(Eigen::Vector3d, a),
SOA_EIGEN_COLUMN(Eigen::Vector3d, b),
SOA_EIGEN_COLUMN(Eigen::Vector3d, r),
Expand All @@ -44,6 +46,8 @@ GENERATE_SOA_VIEW(SoA1ViewTemplate,
SOA_VIEW_VALUE(soa1, x),
SOA_VIEW_VALUE(soa1, y),
SOA_VIEW_VALUE(soa1, z),
SOA_VIEW_VALUE(soa1, sum),
SOA_VIEW_VALUE(soa1, prod),
SOA_VIEW_VALUE(soa1, color),
SOA_VIEW_VALUE(soa1, value),
SOA_VIEW_VALUE(soa1, py),
Expand Down Expand Up @@ -100,6 +104,48 @@ GENERATE_SOA_CONST_VIEW(SoA1View2Gconst,
)
);

// Parameter reusing kernels. The disassembly will indicate whether the compiler uses the wanted cache hits and uses
// `restrict` hints avoid multiple reduce loads.
// The PTX can be obtained using -ptx insterad of -c when compiling.
template <typename T>
__device__ void addAndMulTemplate (
T soa, size_t size) {
auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
auto si = soa[idx];
si.sum() = si.x() + si.y();
si.prod() = si.x() * si.y();
}

__global__ void aAMDef(SoA1ViewTemplate<cms::soa::CacheLineSize::defaultSize,
cms::soa::AlignmentEnforcement::Relaxed,
cms::soa::CacheAccessStyle::Default,
cms::soa::RestrictQualify::Disabled> soa, size_t size) {
addAndMulTemplate(soa, size);
}

__global__ void aAMRestrict(SoA1ViewTemplate<cms::soa::CacheLineSize::defaultSize,
cms::soa::AlignmentEnforcement::Relaxed,
cms::soa::CacheAccessStyle::Default,
cms::soa::RestrictQualify::Enabled> soa, size_t size) {
addAndMulTemplate(soa, size);
}

__global__ void aAMNC(SoA1ViewTemplate<cms::soa::CacheLineSize::defaultSize,
cms::soa::AlignmentEnforcement::Relaxed,
cms::soa::CacheAccessStyle::NonCoherent,
cms::soa::RestrictQualify::Disabled> soa, size_t size) {
addAndMulTemplate(soa, size);
}

__global__ void aAMRestrict(SoA1ViewTemplate<cms::soa::CacheLineSize::defaultSize,
cms::soa::AlignmentEnforcement::Relaxed,
cms::soa::CacheAccessStyle::NonCoherent,
cms::soa::RestrictQualify::Enabled> soa, size_t size) {
addAndMulTemplate(soa, size);
}


const size_t size=10000;

int main() {
Expand Down

0 comments on commit 87d084a

Please sign in to comment.