Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional SYCL USM (device pointer explicit copy) and CUDA tuning for DOT #122

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ register_model(hip HIP HIPStream.cpp)
register_model(cuda CUDA CUDAStream.cu)
register_model(kokkos KOKKOS KokkosStream.cpp)
register_model(sycl SYCL SYCLStream.cpp)
register_model(syclusm SYCLUSM SYCLStream.cpp)
register_model(sycl2020 SYCL2020 SYCLStream2020.cpp)
register_model(acc ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
Expand Down
8 changes: 8 additions & 0 deletions src/cuda/CUDAStream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

#include "CUDAStream.h"

// Runtime-configurable block count that replaces the former compile-time
// constant of the same name; declared `extern` in CUDAStream.h.
// Starts at zero (explicitly, rather than via implicit static
// zero-initialisation); the CUDAStream constructor overwrites it with
// 4 * multiProcessorCount of the selected device.
// NOTE(review): presumably the grid size for the dot reduction kernel --
// the launch site is not visible here, confirm before relying on it.
int DOT_NUM_BLOCKS = 0;

void check_error(void)
{
cudaError_t err = cudaGetLastError();
Expand Down Expand Up @@ -38,10 +40,16 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
throw std::runtime_error("Invalid device index");
cudaSetDevice(device_index);
check_error();
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_index);
check_error();

// Print out device information
std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
DOT_NUM_BLOCKS = 4 * prop.multiProcessorCount;
std::cout << "dot " << DOT_NUM_BLOCKS << " " << TBSIZE << " " << array_size
<< "\n";

array_size = ARRAY_SIZE;

Expand Down
2 changes: 1 addition & 1 deletion src/cuda/CUDAStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#endif

#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
extern int DOT_NUM_BLOCKS;

template <class T>
class CUDAStream : public Stream<T>
Expand Down
15 changes: 10 additions & 5 deletions src/hip/HIPStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
#include "HIPStream.h"
#include "hip/hip_runtime.h"

#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
#define TBSIZE 256
int DOT_NUM_BLOCKS;

void check_error(void)
{
Expand Down Expand Up @@ -42,19 +42,24 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
hipSetDevice(device_index);
check_error();

// get properties
hipDeviceProp_t props;
hipGetDeviceProperties(&props, 0);
DOT_NUM_BLOCKS = props.multiProcessorCount * 4;

// Print out device information
std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
std::cout << "DOT_NUM_BLOCKS: " << DOT_NUM_BLOCKS << " TBSIZE " <<
TBSIZE << "\n";

array_size = ARRAY_SIZE;

// Allocate the host array for partial sums for dot kernels
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);

// Check buffers fit on the device
hipDeviceProp_t props;
hipGetDeviceProperties(&props, 0);
if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T))
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
throw std::runtime_error("Device does not have enough memory for all 3 buffers");

// Create device buffers
Expand Down
4 changes: 3 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
#include "SYCLStream.h"
#elif defined(SYCL2020)
#include "SYCLStream2020.h"
#elif defined(SYCLUSM)
#include "SYCLStreamUSM.h"
#elif defined(OMP)
#include "OMPStream.h"
#endif
Expand Down Expand Up @@ -290,7 +292,7 @@ void run()
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);

#elif defined(SYCL) || defined(SYCL2020)
#elif defined(SYCL) || defined(SYCL2020) || defined(SYCLUSM)
// Use the SYCL implementation
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);

Expand Down
2 changes: 1 addition & 1 deletion src/sycl/SYCLStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
else
{
dot_num_groups = dev.get_info<info::device::max_compute_units>() * 4;
dot_wgsize = dev.get_info<info::device::max_work_group_size>();
dot_wgsize = 256; // good for AMD, doesn't hurt elsewhere
}

// Print out device information
Expand Down
Loading