Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge pull request #220 from gonzalobg/bugfix/memcpy_async_sz
Browse files Browse the repository at this point in the history
memcpy_async should cache only in L2 when possible
  • Loading branch information
wmaxey authored Nov 5, 2021
2 parents 5834a3b + 652c092 commit 4f42427
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions include/cuda/std/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ template<>
struct __memcpy_async_impl<4, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) {
- asm volatile ("cp.async.ca.shared.global [%0], [%1], 4;"
+ asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
Expand All @@ -319,7 +319,7 @@ template<>
struct __memcpy_async_impl<8, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) {
- asm volatile ("cp.async.ca.shared.global [%0], [%1], 8;"
+ asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
Expand All @@ -332,7 +332,7 @@ template<>
struct __memcpy_async_impl<16, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) {
- asm volatile ("cp.async.ca.shared.global [%0], [%1], 16;"
+ asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
Expand Down

0 comments on commit 4f42427

Please sign in to comment.