Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for AMD GPUs via HIP #116

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ CFLAGS ?= -fPIC -O3 -funroll-loops -march=native
CXXFLAGS ?= $(CFLAGS) -std=c++14
NVCCFLAGS ?= -std=c++14 -ccbin=$(CXX) -O3 $(NVARCH) -Wno-deprecated-gpu-targets \
--default-stream per-thread -Xcompiler "$(CXXFLAGS)"
DEVFLAGS ?= --device-c

# For debugging, tell nvcc to add symbols to host and device code respectively,
#NVCCFLAGS+= -g -G
Expand All @@ -51,7 +52,8 @@ else
endif

# Common includes
INC += -I$(CUDA_ROOT)/include -Icontrib/cuda_samples
INC ?= -I$(CUDA_ROOT)/include
INC += -Icontrib -Icontrib/cuda_samples

# NVCC-specific libs
NVCC_LIBS_PATH += -L$(CUDA_ROOT)/lib64
Expand All @@ -63,7 +65,7 @@ ifdef NVCC_STUBS
NVCC_LIBS_PATH += -L$(NVCC_STUBS)
endif

LIBS += -lm -lcudart -lstdc++ -lnvToolsExt -lcufft -lcuda
LIBS ?= -lm -lcudart -lstdc++ -lnvToolsExt -lcufft -lcuda


#############################################################
Expand All @@ -73,7 +75,8 @@ LIBS += -lm -lcudart -lstdc++ -lnvToolsExt -lcufft -lcuda
# Include header files
INC += -I include

LIBNAME=libcufinufft
LIBNAME_SHORT=cufinufft
LIBNAME=lib$(LIBNAME_SHORT)
DYNAMICLIB=lib/$(LIBNAME).so
STATICLIB=lib-static/$(LIBNAME).a

Expand Down Expand Up @@ -104,13 +107,13 @@ CUFINUFFTOBJS_32=$(CUFINUFFTOBJS_64:%.o=%_32.o)
%_32.o: %.c $(HEADERS)
$(CC) -DSINGLE -c $(CFLAGS) $(INC) $< -o $@
%_32.o: %.cu $(HEADERS)
$(NVCC) -DSINGLE --device-c -c $(NVCCFLAGS) $(INC) $< -o $@
$(NVCC) -DSINGLE $(DEVFLAGS) -c $(NVCCFLAGS) $(INC) $< -o $@
%.o: %.cpp $(HEADERS)
$(CXX) -c $(CXXFLAGS) $(INC) $< -o $@
%.o: %.c $(HEADERS)
$(CC) -c $(CFLAGS) $(INC) $< -o $@
%.o: %.cu $(HEADERS)
$(NVCC) --device-c -c $(NVCCFLAGS) $(INC) $< -o $@
$(NVCC) $(DEVFLAGS) -c $(NVCCFLAGS) $(INC) $< -o $@

default: all

Expand Down Expand Up @@ -158,11 +161,11 @@ examples: $(BINDIR)/example2d1many \

$(BINDIR)/example%: examples/example%.cpp $(DYNAMICLIB) $(HEADERS)
mkdir -p $(BINDIR)
$(NVCC) $(NVCCFLAGS) $(INC) $(LIBS) -o $@ $< $(DYNAMICLIB)
$(NVCC) $(NVCCFLAGS) $(INC) $(LIBS) -o $@ $< -Llib -l$(LIBNAME_SHORT)

$(BINDIR)/cufinufft2d2api_test%: test/cufinufft2d2api_test%.o $(DYNAMICLIB)
mkdir -p $(BINDIR)
$(NVCC) $(NVCCFLAGS) $(LIBS) -o $@ $< $(DYNAMICLIB)
$(NVCC) $(NVCCFLAGS) $(LIBS) -o $@ $< -Llib -l$(LIBNAME_SHORT)

$(BINDIR)/%_32: test/%_32.o $(CUFINUFFTOBJS_32) $(CUFINUFFTOBJS)
mkdir -p $(BINDIR)
Expand Down Expand Up @@ -237,11 +240,11 @@ check2D_64: spreadtest libtest
bin/cufinufft2d1many_test 2 512 512 256
bin/cufinufft2d1many_test 1 1e2 2e2 3e2 16 1e4
bin/cufinufft2d1many_test 2 1e2 2e2 3e2 16 1e4
bin/cufinufft2d2many_test 1 64 64 128 1e-3
bin/cufinufft2d2many_test 1 64 64 128 1e-3 # This test fails on Crusher ROCm/4.5.0
bin/cufinufft2d2many_test 1 256 256 1024
bin/cufinufft2d2many_test 2 512 512 256
bin/cufinufft2d2many_test 1 256 256 1024
bin/cufinufft2d2many_test 1 1e2 2e2 3e2 16 1e4
bin/cufinufft2d2many_test 1 1e2 2e2 3e2 16 1e4 # This test fails on Crusher ROCm/4.5.0
bin/cufinufft2d2many_test 2 1e2 2e2 3e2 16 1e4

check2D_32: spreadtest libtest
Expand Down
81 changes: 81 additions & 0 deletions contrib/cuda_hip_wrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#ifndef CUDA_HIP_WRAPPER_H
#define CUDA_HIP_WRAPPER_H

// Single include point for the GPU runtime, complex-number and FFT headers.
//
// When USE_HIP is defined, the HIP / hipFFT headers are included and the
// CUDA / cuFFT identifiers listed below are aliased to their HIP counterparts
// with object-like macros, so sources written against the CUDA names compile
// unchanged. Otherwise the native CUDA headers are included and no aliases
// are defined.
//
// Only the identifiers listed here are mapped; a CUDA name that is not in
// this list stays unresolved in a HIP build.

#ifdef USE_HIP

#include <hip/hip_runtime.h>
#include <hip/hip_complex.h>
#include <hipfft.h>

// cuda.h adapters
#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaEventCreate hipEventCreate
#define cudaEventElapsedTime hipEventElapsedTime
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaFree hipFree
#define cudaFreeHost hipHostFree // hipFreeHost is deprecated
#define cudaGetDevice hipGetDevice
#define cudaGetErrorName hipGetErrorName
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaMallocHost hipHostMalloc // hipMallocHost is deprecated
#define cudaMemcpy hipMemcpy
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemset hipMemset
#define cudaSetDevice hipSetDevice
#define cudaStreamCreate hipStreamCreate
#define cudaStreamDestroy hipStreamDestroy
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess

// cuComplex.h adapters
#define cuDoubleComplex hipDoubleComplex
#define cuFloatComplex hipFloatComplex

// cufft.h adapters
// NOTE: CUFFT_LICENSE_ERROR is intentionally not mapped here; code that uses
// it must guard the use with #ifndef USE_HIP.
#define CUFFT_ALLOC_FAILED HIPFFT_ALLOC_FAILED
#define CUFFT_C2C HIPFFT_C2C
#define CUFFT_EXEC_FAILED HIPFFT_EXEC_FAILED
#define CUFFT_INCOMPLETE_PARAMETER_LIST HIPFFT_INCOMPLETE_PARAMETER_LIST
#define CUFFT_INTERNAL_ERROR HIPFFT_INTERNAL_ERROR
#define CUFFT_INVALID_DEVICE HIPFFT_INVALID_DEVICE
#define CUFFT_INVALID_PLAN HIPFFT_INVALID_PLAN
#define CUFFT_INVALID_SIZE HIPFFT_INVALID_SIZE
#define CUFFT_INVALID_TYPE HIPFFT_INVALID_TYPE
#define CUFFT_INVALID_VALUE HIPFFT_INVALID_VALUE
#define CUFFT_NOT_IMPLEMENTED HIPFFT_NOT_IMPLEMENTED
#define CUFFT_NOT_SUPPORTED HIPFFT_NOT_SUPPORTED
#define CUFFT_NO_WORKSPACE HIPFFT_NO_WORKSPACE
#define CUFFT_PARSE_ERROR HIPFFT_PARSE_ERROR
#define CUFFT_SETUP_FAILED HIPFFT_SETUP_FAILED
#define CUFFT_SUCCESS HIPFFT_SUCCESS
#define CUFFT_UNALIGNED_DATA HIPFFT_UNALIGNED_DATA
#define CUFFT_Z2Z HIPFFT_Z2Z
#define cufftDestroy hipfftDestroy
#define cufftExecC2C hipfftExecC2C
#define cufftExecZ2Z hipfftExecZ2Z
#define cufftHandle hipfftHandle
#define cufftPlan1d hipfftPlan1d
#define cufftPlanMany hipfftPlanMany
#define cufftResult_t hipfftResult_t

// helper_cuda.h adapters
// NOTE(review): this looks like the include-guard macro of CUDA's
// driver_types.h, pre-defined so that helper_cuda.h enables the sections it
// conditions on that guard even though the CUDA header is never included in a
// HIP build - confirm against contrib/cuda_samples/helper_cuda.h. This header
// must therefore be included before <helper_cuda.h>.
#define __DRIVER_TYPES_H__

#else

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuComplex.h>
#include <cufft.h>

#endif

#endif
2 changes: 1 addition & 1 deletion contrib/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <stdint.h>

#include <complex> // C++ type complex
#include <cuComplex.h>
#include "cuda_hip_wrapper.h"
#include "dataTypes.h"

// fraction growth cut-off in arraywidcen(), to decide if translate in type-3
Expand Down
2 changes: 1 addition & 1 deletion contrib/utils_fp.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <stdint.h>

#include <complex> // C++ type complex
#include <cuComplex.h>
#include "cuda_hip_wrapper.h"
#include "dataTypes.h"


Expand Down
7 changes: 5 additions & 2 deletions examples/example2d1many.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

import numpy as np

import pycuda.autoinit
from pycuda.gpuarray import GPUArray, to_gpu
import PybindGPU.gpuarray as gpuarray
from PybindGPU.gpuarray import GPUArray, to_gpu

#import pycuda.autoinit
#from pycuda.gpuarray import GPUArray, to_gpu

from cufinufft import cufinufft

Expand Down
6 changes: 4 additions & 2 deletions examples/example2d2many.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@

import numpy as np

import pycuda.autoinit
from pycuda.gpuarray import GPUArray, to_gpu
import PybindGPU.gpuarray as gpuarray
from PybindGPU.gpuarray import GPUArray, to_gpu
#import pycuda.autoinit
#from pycuda.gpuarray import GPUArray, to_gpu

from cufinufft import cufinufft

Expand Down
3 changes: 1 addition & 2 deletions include/cufinufft_eitherprec.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
// Make sure we don't include double or single headers more than once...

#include <cstdlib>
#include <cufft.h>
#include "cuda_hip_wrapper.h"
#include <assert.h>
#include <cuda_runtime.h>
#include "cufinufft_opts.h"
#include "../src/precision_independent.h"
#include "cufinufft_errors.h"
Expand Down
2 changes: 2 additions & 0 deletions include/cufinufft_errors.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ static const char* _cufftGetErrorEnum(cufftResult_t error)
return "cufft_no_workspace";
case CUFFT_NOT_IMPLEMENTED:
return "cufft_not_implemented";
#ifndef USE_HIP
case CUFFT_LICENSE_ERROR:
return "cufft_license_error";
#endif
case CUFFT_NOT_SUPPORTED:
return "cufft_not_supported";
}
Expand Down
1 change: 0 additions & 1 deletion python/cufinufft/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
numpy<1.22
pycuda
six
6 changes: 4 additions & 2 deletions python/cufinufft/tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import numpy as np

import pycuda.autoinit # NOQA:401
import pycuda.gpuarray as gpuarray
#import PybindGPU
import PybindGPU.gpuarray as gpuarray
#import pycuda.autoinit # NOQA:401
#import pycuda.gpuarray as gpuarray

from cufinufft import cufinufft

Expand Down
5 changes: 3 additions & 2 deletions python/cufinufft/tests/test_error_checks.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import numpy as np
import pytest

import pycuda.autoinit # NOQA:401
import pycuda.gpuarray as gpuarray
import PybindGPU.gpuarray as gpuarray
#import pycuda.autoinit # NOQA:401
#import pycuda.gpuarray as gpuarray

from cufinufft import cufinufft

Expand Down
19 changes: 12 additions & 7 deletions python/cufinufft/tests/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import numpy as np

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import PybindGPU
import PybindGPU.gpuarray as gpuarray
#import pycuda.driver as drv
#import pycuda.gpuarray as gpuarray

from cufinufft import cufinufft

Expand All @@ -13,21 +15,24 @@
def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
complex_dtype = utils._complex_dtype(dtype)

drv.init()
#drv.init()

dev_count = drv.Device.count()
#dev_count = drv.Device.count()
dev_count, err = PybindGPU.cudaGetDeviceCount()

if dev_count == 1:
pytest.skip()

devs = [drv.Device(dev_id) for dev_id in range(dev_count)]
#devs = [drv.Device(dev_id) for dev_id in range(dev_count)]
devs = [PybindGPU.cudaDeviceProp(i) for i in range(dev_count)]

dim = len(shape)

errs = []

for dev_id, dev in enumerate(devs):
ctx = dev.make_context()
PybindGPU.cudaSetDevice(dev_id)
#ctx = dev.make_context()

k = utils.gen_nu_pts(M, dim=dim).astype(dtype)
c = utils.gen_nonuniform_data(M).astype(complex_dtype)
Expand All @@ -54,7 +59,7 @@ def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):

print(f'Type 1 relative error (GPU {dev_id}):', type1_rel_err)

ctx.pop()
#ctx.pop()

errs.append(type1_rel_err)

Expand Down
9 changes: 9 additions & 0 deletions sites/make.inc.olcf_crusher
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Site overrides for OLCF Crusher: build the library with hipcc (HIP backend)
# instead of nvcc. ROCM_PATH must be set in the environment.
# CC := hipcc
CXX := hipcc
# The Makefile's .cu rules invoke $(NVCC); point it at hipcc.
NVCC := hipcc
# GPU architecture flag, folded into NVCCFLAGS below.
NVARCH := --offload-arch=gfx90a
# -DUSE_HIP selects the HIP branch of contrib/cuda_hip_wrapper.h.
CXXFLAGS += -fPIC -DUSE_HIP -I$(ROCM_PATH)/include
# Immediate (:=) expansion: NVARCH and CXXFLAGS must already be set above.
# Replaces the Makefile's nvcc-specific default flags.
NVCCFLAGS := -std=c++14 -O3 $(NVARCH) $(CXXFLAGS)
# Takes the place of the Makefile's default `--device-c` in the .cu rules.
DEVFLAGS := -fgpu-rdc
# Deliberately empty. NOTE(review): presumably this keeps the Makefile's
# `INC ?= -I$(CUDA_ROOT)/include` default from applying - confirm this file is
# included before that line.
INC :=
# NOTE(review): presumably overrides the Makefile's `LIBS ?=` CUDA library
# list in the same way - confirm include order.
LIBS := -fgpu-rdc -L$(ROCM_PATH)/lib -lhipfft
9 changes: 9 additions & 0 deletions sites/make.inc.olcf_spock
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Site overrides for OLCF Spock: build the library with hipcc (HIP backend)
# instead of nvcc. ROCM_PATH must be set in the environment.
# CC := hipcc
CXX := hipcc
# The Makefile's .cu rules invoke $(NVCC); point it at hipcc.
NVCC := hipcc
# GPU architecture flag, folded into NVCCFLAGS below.
# NOTE(review): the Crusher site file spells this flag `--offload-arch=`;
# confirm whether the ROCm version on Spock still requires `--amdgpu-target=`.
NVARCH := --amdgpu-target=gfx908
# -DUSE_HIP selects the HIP branch of contrib/cuda_hip_wrapper.h.
# hipFFT headers are taken from $(ROCM_PATH)/hipfft/include on this system.
CXXFLAGS += -fPIC -DUSE_HIP -I$(ROCM_PATH)/hipfft/include
# Immediate (:=) expansion: NVARCH and CXXFLAGS must already be set above.
# Replaces the Makefile's nvcc-specific default flags.
NVCCFLAGS := -std=c++14 -O3 $(NVARCH) $(CXXFLAGS)
# Takes the place of the Makefile's default `--device-c` in the .cu rules.
DEVFLAGS := -fgpu-rdc
# Deliberately empty. NOTE(review): presumably this keeps the Makefile's
# `INC ?= -I$(CUDA_ROOT)/include` default from applying - confirm this file is
# included before that line.
INC :=
# NOTE(review): presumably overrides the Makefile's `LIBS ?=` CUDA library
# list in the same way - confirm include order.
LIBS := -fgpu-rdc -L$(ROCM_PATH)/hipfft/lib -lhipfft
2 changes: 1 addition & 1 deletion src/1d/cufinufft1d.cu
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#include <iostream>
#include <iomanip>
#include <math.h>
#include "cuda_hip_wrapper.h"
#include <helper_cuda.h>
#include <complex>
#include <cufft.h>

#include <cufinufft_eitherprec.h>
#include "../cuspreadinterp.h"
Expand Down
3 changes: 1 addition & 2 deletions src/1d/interp1d_wrapper.cu
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#include <helper_cuda.h>
#include "cuda_hip_wrapper.h"
#include <iostream>
#include <iomanip>

#include <cuComplex.h>
#include "../cuspreadinterp.h"
#include "../memtransfer.h"
#include <profile.h>
Expand Down
2 changes: 1 addition & 1 deletion src/1d/spread1d_wrapper.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "cuda_hip_wrapper.h"
#include <helper_cuda.h>
#include <iostream>
#include <iomanip>
Expand All @@ -6,7 +7,6 @@
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

#include <cuComplex.h>
#include "../cuspreadinterp.h"
#include "../memtransfer.h"

Expand Down
2 changes: 1 addition & 1 deletion src/1d/spreadinterp1d.cu
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include <iostream>
#include <math.h>
#include "cuda_hip_wrapper.h"
#include <helper_cuda.h>
#include <cuda.h>
#include <thrust/extrema.h>
#include "../../contrib/utils.h"
#include "../../contrib/utils_fp.h"
Expand Down
2 changes: 1 addition & 1 deletion src/2d/cufinufft2d.cu
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#include <iostream>
#include <iomanip>
#include <math.h>
#include "cuda_hip_wrapper.h"
#include <helper_cuda.h>
#include <complex>
#include <cufft.h>

#include <cufinufft_eitherprec.h>
#include "../cuspreadinterp.h"
Expand Down
2 changes: 1 addition & 1 deletion src/2d/interp2d_wrapper.cu
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#include "cuda_hip_wrapper.h"
#include <helper_cuda.h>
#include <iostream>
#include <iomanip>

#include <cuComplex.h>
#include "../cuspreadinterp.h"
#include "../memtransfer.h"
#include <profile.h>
Expand Down
Loading