CUDA 9 support
Summary:
Adds support for the CUDA 9 toolkit.

Includes new fp16 data type fixes and changes for warp-synchronous programming. Also updates the third-party CUB submodule for CUDA 9 support.
Closes #853

Differential Revision: D5548507

Pulled By: Yangqing

fbshipit-source-id: c7fd2edb623f2aa8c67b9a1000efc8f71e6832ab
slayton58 authored and facebook-github-bot committed Aug 6, 2017
1 parent c514d77 commit 78fbcaa
Showing 8 changed files with 80 additions and 35 deletions.
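
For orientation (not part of the commit): every warp-vote hunk below follows the same pattern. CUDA 9 deprecates the implicitly whole-warp __ballot() intrinsic, so each call site is guarded on CUDA_VERSION and switched to __ballot_sync() with an explicit participation mask from __activemask(). A minimal standalone sketch of that pattern; the kernel name and counting logic are illustrative and not taken from Caffe2:

#include <cuda.h>

// Illustrative kernel: count how many input elements exceed a threshold,
// using one warp ballot per 32 threads.
__global__ void countAboveThreshold(const float* in, int n, float thresh, int* count) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  bool pred = (i < n) && (in[i] > thresh);
#if CUDA_VERSION >= 9000
  // CUDA 9+: warp votes take an explicit mask of participating lanes.
  unsigned int vote = __ballot_sync(__activemask(), pred);
#else
  // Pre-CUDA 9: implicit whole-warp vote.
  unsigned int vote = __ballot(pred);
#endif
  // Lane 0 of each warp adds the warp's count of set predicate bits.
  if ((threadIdx.x & 31) == 0) {
    atomicAdd(count, __popc(vote));
  }
}
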
4 changes: 4 additions & 0 deletions caffe2/operators/top_k_heap_selection.cuh
@@ -112,7 +112,11 @@ warpHeap(K k, V v, K& keyHeapHead, K* keyHeap, V* valueHeap) {
bool wantInsert = Dir ? (k > keyHeapHead) : (k < keyHeapHead);

// Find out all the lanes that have elements to add to the heap
#if CUDA_VERSION >= 9000
unsigned int vote = __ballot_sync(__activemask(), wantInsert);
#else
unsigned int vote = __ballot(wantInsert);
#endif

if (!vote) {
// Everything the warp has is smaller than our heap
4 changes: 4 additions & 0 deletions caffe2/operators/top_k_radix_selection.cuh
@@ -167,7 +167,11 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize],
#pragma unroll
for (unsigned int j = 0; j < RadixSize; ++j) {
bool vote = hasVal && (digitInRadix == j);
#if CUDA_VERSION >= 9000
counts[j] += __popc(__ballot_sync(__activemask(), vote));
#else
counts[j] += __popc(__ballot(vote));
#endif
}
}

2 changes: 1 addition & 1 deletion caffe2/utils/GpuDefs.cuh
@@ -8,7 +8,7 @@ namespace caffe2 {
// Static definition of GPU warp size for unrolling and code generation

#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ <= 620
#if __CUDA_ARCH__ <= 700
constexpr int kWarpSize = 32;
#else
#error Unknown __CUDA_ARCH__; please define parameters for compute capability
5 changes: 5 additions & 0 deletions caffe2/utils/GpuScanUtils.cuh
@@ -62,7 +62,12 @@ __device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunct
template <typename T, bool KillWARDependency, class BinaryFunction>
__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
// Within-warp, we use warp voting.
#if CUDA_VERSION >= 9000
T vote = __ballot_sync(__activemask(), in);
#else
T vote = __ballot(in);
#endif

T index = __popc(getLaneMaskLe() & vote);
T carry = __popc(vote);

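As context for the hunk above (not part of the commit): the scan works because, after the ballot, each lane's inclusive scan value is simply the number of set vote bits at or below its own lane; getLaneMaskLe() supplies that mask in the Caffe2 GPU utilities. A standalone sketch of the idea, computing the lane mask inline instead:

#include <cuda.h>

// Illustrative warp-level inclusive prefix scan of a per-lane 0/1 flag.
__global__ void warpBinaryPrefixScan(const int* flags, int* indices, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  bool in = (i < n) && (flags[i] != 0);
  unsigned int lane = threadIdx.x & 31;
  unsigned int laneMaskLe = 0xffffffffu >> (31 - lane);  // bits 0..lane set
#if CUDA_VERSION >= 9000
  unsigned int vote = __ballot_sync(__activemask(), in);
#else
  unsigned int vote = __ballot(in);
#endif
  // Inclusive scan value: number of true lanes at or below this lane.
  if (i < n) {
    indices[i] = __popc(laneMaskLe & vote);
  }
}
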
72 changes: 51 additions & 21 deletions caffe2/utils/conversions.h
@@ -108,41 +108,67 @@ inline float cpu_half2float(float16 h) {
}

}; // anonymous

#if __CUDACC__

#if CUDA_VERSION >= 9000
inline float16 halfToFloat16(half x) {
float16 r = *reinterpret_cast<float16*>(&x);
return r;
}

inline half float16ToHalf(const float16 x) {
__half_raw hr;
hr.x = x.x;
half r(hr);
return r;
}

inline half floatToHalf(const float x) {
float16 xh = cpu_float2half_rn(x);
return float16ToHalf(xh);
}

#else
inline float16 halfToFloat16(__half x) {
float16 r;
r.x = x.x;
return r;
}

inline __half float16ToHalf(const float16 x) {
__half r;
r.x = x.x;
return r;
}

inline half floatToHalf(const float x) {
float16 xh = cpu_float2half_rn(x);
return float16ToHalf(xh);
}
#endif // CUDA_VERSION

#endif // __CUDACC__

// general version: defer to static_cast
template <typename IN, typename OUT>
CONVERSIONS_DECL OUT To(const IN in) {
return static_cast<OUT>(in);
}

#if __CUDA_ARCH__
__device__ __inline__ __half inf_clip(__half h) {
int isi = __hisinf(h);
if (isi > 0) {
// Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
h.x = 0x7bffU;
} else if (isi < 0) {
// As above, negated
h.x = 0x7bffU ^ 0x8000;
}
return h;
}
#endif

// explicit for fp16
template <>
CONVERSIONS_DECL float16 To(const float in) {
#if __CUDA_ARCH__
// hacky interface between C2 fp16 and CUDA
float16 ret;
#if 0
// alternative truncation scheme
__half r;
r.x = __float2half_rn(in);
ret.x = inf_clip(r).x;
#if CUDA_VERSION >= 9000
half rh = static_cast<half>(in);
return halfToFloat16(rh);
#else
float16 ret;
ret.x = __float2half(in).x;
#endif
return ret;
#endif // CUDA_VERSION >= 9000
#else
return cpu_float2half_rn(in);
#endif
@@ -151,7 +177,11 @@ CONVERSIONS_DECL float16 To(const float in) {
template <>
CONVERSIONS_DECL float To(const float16 in) {
#if __CUDA_ARCH__
#if CUDA_VERSION >= 9000
__half_raw tmp;
#else
__half tmp;
#endif
tmp.x = in.x;
return __half2float(tmp);
#else
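As context for conversions.h (not part of the commit): caffe2::float16 is a plain struct carrying the raw 16-bit pattern, while CUDA 9 makes __half opaque and exposes the raw bits only through __half_raw, which is why the helpers and the To() specializations above need two code paths. A minimal host-side usage sketch, with the header path and namespaces taken from the file above and the values purely illustrative:

#include <cstdio>
#include "caffe2/utils/conversions.h"

int main() {
  float alpha = 0.5f;
  // Host path: float -> caffe2::float16 via cpu_float2half_rn, and back.
  caffe2::float16 a16 = caffe2::convert::To<float, caffe2::float16>(alpha);
  float back = caffe2::convert::To<caffe2::float16, float>(a16);
  std::printf("%f -> fp16 -> %f\n", alpha, back);
  return 0;
}
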
15 changes: 6 additions & 9 deletions caffe2/utils/math_gpu.cu
@@ -178,11 +178,10 @@ void Gemm<float16, CUDAContext>(
N));
} else if (math_type == TensorProto_DataType_FLOAT16) {
// convert alpha, beta from caffe2::float16 -> __half
__half alpha_fp16;
alpha_fp16.x = convert::To<float, float16>(alpha).x;
__half beta_fp16;
beta_fp16.x = convert::To<float, float16>(beta).x;
// convert alpha, beta from float -> __half
auto alpha_fp16 = convert::floatToHalf(alpha);
auto beta_fp16 = convert::floatToHalf(beta);
// call cublasHgemm
CUBLAS_CHECK(cublasHgemm(
context->cublas_handle(),
@@ -353,10 +352,8 @@ void Gemv<float16, CUDAContext>(
CUDA_R_16F,
LDC));
} else if (math_type == TensorProto_DataType_FLOAT16) {
__half alpha_fp16;
alpha_fp16.x = convert::To<float, float16>(alpha).x;
__half beta_fp16;
beta_fp16.x = convert::To<float, float16>(beta).x;
auto alpha_fp16 = convert::floatToHalf(alpha);
auto beta_fp16 = convert::floatToHalf(beta);
CUBLAS_CHECK(cublasHgemm(
context->cublas_handle(),
11 changes: 8 additions & 3 deletions cmake/Cuda.cmake
@@ -3,7 +3,9 @@
set(Caffe2_known_gpu_archs8 "20 21(20) 30 35 50 52 60 61") # for CUDA 8.x
set(Caffe2_known_gpu_archs7 "20 21(20) 30 35 50 52") # for CUDA 7.x
# This list will be used for CUDA_ARCH_NAME = All option
set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs8}) # latest supported CUDA version is 8.x
set(Caffe2_known_gpu_archs "20 21(20) 30 35 50 52 60 61 70")
set(Caffe2_known_gpu_archs7 "20 21(20) 30 35 50 52")
set(Caffe2_known_gpu_archs8 "20 21(20) 30 35 50 52 60 61")

################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
@@ -88,6 +90,8 @@ function(caffe2_select_nvcc_arch_flags out_variable)
set(__cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(__cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(__cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(__cuda_arch_bin ${Caffe2_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -187,11 +191,12 @@ elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
else() # CUDA 9.x or later version
message(STATUS "The CUDA version is not officially supported yet, cmake process will continue")
endif()

include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
2 changes: 1 addition & 1 deletion third_party/cub
Submodule cub updated 54 files
+1 −0 .settings/.gitignore
+1 −1 .settings/language.settings.xml
+12 −0 CHANGE_LOG.TXT
+5 −0 common.mk
+4 −4 cub/agent/agent_histogram.cuh
+20 −18 cub/agent/agent_radix_sort_downsweep.cuh
+6 −6 cub/agent/agent_radix_sort_upsweep.cuh
+3 −3 cub/agent/agent_reduce.cuh
+13 −8 cub/agent/agent_reduce_by_key.cuh
+6 −9 cub/agent/agent_rle.cuh
+4 −4 cub/agent/agent_scan.cuh
+1 −1 cub/agent/agent_segment_fixup.cuh
+9 −9 cub/agent/agent_select_if.cuh
+10 −10 cub/agent/agent_spmv_csrt.cuh
+17 −17 cub/agent/agent_spmv_orig.cuh
+5 −5 cub/agent/agent_spmv_row_based.cuh
+5 −5 cub/agent/single_pass_scan_operators.cuh
+8 −8 cub/block/block_adjacent_difference.cuh
+8 −8 cub/block/block_discontinuity.cuh
+23 −23 cub/block/block_exchange.cuh
+1 −1 cub/block/block_histogram.cuh
+18 −6 cub/block/block_load.cuh
+2 −2 cub/block/block_radix_rank.cuh
+6 −6 cub/block/block_radix_sort.cuh
+16 −16 cub/block/block_scan.cuh
+4 −4 cub/block/block_shuffle.cuh
+9 −3 cub/block/block_store.cuh
+3 −3 cub/block/specializations/block_histogram_sort.cuh
+1 −1 cub/block/specializations/block_reduce_raking.cuh
+2 −2 cub/block/specializations/block_reduce_raking_commutative_only.cuh
+1 −1 cub/block/specializations/block_reduce_warp_reductions.cuh
+18 −17 cub/block/specializations/block_scan_raking.cuh
+3 −3 cub/block/specializations/block_scan_warp_scans.cuh
+5 −5 cub/block/specializations/block_scan_warp_scans2.cuh
+10 −10 cub/block/specializations/block_scan_warp_scans3.cuh
+0 −2 cub/device/device_reduce.cuh
+855 −855 cub/device/device_segmented_radix_sort.cuh
+8 −8 cub/device/dispatch/dispatch_radix_sort.cuh
+4 −4 cub/grid/grid_barrier.cuh
+0 −16 cub/thread/thread_load.cuh
+4 −1 cub/thread/thread_operators.cuh
+0 −10 cub/thread/thread_store.cuh
+3 −0 cub/util_arch.cuh
+25 −25 cub/util_debug.cuh
+347 −347 cub/util_device.cuh
+159 −221 cub/util_ptx.cuh
+115 −39 cub/warp/specializations/warp_reduce_shfl.cuh
+19 −3 cub/warp/specializations/warp_reduce_smem.cuh
+103 −15 cub/warp/specializations/warp_scan_shfl.cuh
+41 −2 cub/warp/specializations/warp_scan_smem.cuh
+1 −0 test/Makefile
+9 −1 test/test_device_histogram.cu
+1 −0 test/test_device_scan.cu
+0 −4 test/test_warp_reduce.cu
