From 4da1f61553eff543934e6777d205e0b4c688c279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 1 Nov 2024 12:35:48 -0700 Subject: [PATCH] [flang][cuda] Data transfer with descriptor (#114598) Reopen PR #114302 as it was automatically closed. Review in #114302 --- flang/runtime/CUDA/memory.cpp | 34 +++++++++++++++++++-- flang/unittests/Runtime/CUDA/Memory.cpp | 40 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp index d03f1cc0e48d661..daf1db684a3d2ed 100644 --- a/flang/runtime/CUDA/memory.cpp +++ b/flang/runtime/CUDA/memory.cpp @@ -9,10 +9,32 @@ #include "flang/Runtime/CUDA/memory.h" #include "../terminator.h" #include "flang/Runtime/CUDA/common.h" +#include "flang/Runtime/assign.h" #include "cuda_runtime.h" namespace Fortran::runtime::cuda { +static void *MemmoveHostToDevice( + void *dst, const void *src, std::size_t count) { + // TODO: Use cudaMemcpyAsync when we have support for stream. + CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); + return dst; +} + +static void *MemmoveDeviceToHost( + void *dst, const void *src, std::size_t count) { + // TODO: Use cudaMemcpyAsync when we have support for stream. + CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); + return dst; +} + +static void *MemmoveDeviceToDevice( + void *dst, const void *src, std::size_t count) { + // TODO: Use cudaMemcpyAsync when we have support for stream. + CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice)); + return dst; +} + extern "C" { void *RTDEF(CUFMemAlloc)( @@ -90,8 +112,16 @@ void RTDEF(CUFDataTransferPtrDesc)(void *addr, Descriptor *desc, void RTDECL(CUFDataTransferDescDesc)(Descriptor *dstDesc, Descriptor *srcDesc, unsigned mode, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; - terminator.Crash( - "not yet implemented: CUDA data transfer between two descriptors"); + MemmoveFct memmoveFct; + if (mode == kHostToDevice) { + memmoveFct = &MemmoveHostToDevice; + } else if (mode == kDeviceToHost) { + memmoveFct = &MemmoveDeviceToHost; + } else if (mode == kDeviceToDevice) { + memmoveFct = &MemmoveDeviceToDevice; + } + Fortran::runtime::Assign( + *dstDesc, *srcDesc, terminator, MaybeReallocate, memmoveFct); } } } // namespace Fortran::runtime::cuda diff --git a/flang/unittests/Runtime/CUDA/Memory.cpp b/flang/unittests/Runtime/CUDA/Memory.cpp index 157d3cdb531def3..3492b72aac09194 100644 --- a/flang/unittests/Runtime/CUDA/Memory.cpp +++ b/flang/unittests/Runtime/CUDA/Memory.cpp @@ -9,11 +9,17 @@ #include "flang/Runtime/CUDA/memory.h" #include "gtest/gtest.h" #include "../../../runtime/terminator.h" +#include "../tools.h" #include "flang/Common/Fortran.h" +#include "flang/Runtime/CUDA/allocator.h" #include "flang/Runtime/CUDA/common.h" +#include "flang/Runtime/CUDA/descriptor.h" +#include "flang/Runtime/allocatable.h" +#include "flang/Runtime/allocator-registry.h" #include "cuda_runtime.h" +using namespace Fortran::runtime; using namespace Fortran::runtime::cuda; TEST(MemoryCUFTest, SimpleAllocTramsferFree) { @@ -29,3 +35,37 @@ TEST(MemoryCUFTest, SimpleAllocTramsferFree) { EXPECT_EQ(42, host); RTNAME(CUFMemFree)((void *)dev, kMemTypeDevice, __FILE__, __LINE__); } + +static OwningPtr createAllocatable( + Fortran::common::TypeCategory tc, int kind, int rank = 1) { + return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr, + CFI_attribute_allocatable); +} + +TEST(MemoryCUFTest, CUFDataTransferDescDesc) { + using Fortran::common::TypeCategory; + RTNAME(CUFRegisterAllocator)(); + // INTEGER(4), DEVICE, ALLOCATABLE :: a(:) + auto dev{createAllocatable(TypeCategory::Integer, 4)}; + dev->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, dev->GetAllocIdx()); + RTNAME(AllocatableSetBounds)(*dev, 0, 1, 10); + RTNAME(AllocatableAllocate) + (*dev, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(dev->IsAllocated()); + + // Create temp array to transfer to device. + auto x{MakeArray(std::vector{10}, + std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9})}; + RTNAME(CUFDataTransferDescDesc)(dev.get(), x.get(), kHostToDevice, __FILE__, __LINE__); + + // Retrieve data from device. + auto host{MakeArray(std::vector{10}, + std::vector{0, 0, 0, 0, 0, 0, 0, 0, 0, 0})}; + RTNAME(CUFDataTransferDescDesc) + (host.get(), dev.get(), kDeviceToHost, __FILE__, __LINE__); + + for (unsigned i = 0; i < 10; ++i) { + EXPECT_EQ(*host->ZeroBasedIndexedElement(i), (std::int32_t)i); + } +}