Replacing CudaAsyncBuffer with TArray to improve perf #3303

Merged (6 commits, Mar 24, 2020)
5 changes: 2 additions & 3 deletions onnxruntime/contrib_ops/cuda/activation/activations.cc
@@ -27,12 +27,11 @@ namespace cuda {
Status x<T>::ComputeInternal(OpKernelContext* context) const { \
UnaryElementwisePreparation p; \
UnaryElementwise::Prepare(context, &p); \
CudaAsyncBuffer<Ctx##x> func_ctx(this, MakeFuncCtx(), 1); \
if (!std::is_same<CtxNull, Ctx##x>::value) ORT_RETURN_IF_ERROR(func_ctx.CopyToGpu()); \
Ctx##x func_ctx = MakeFuncCtx(); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
func_ctx.GpuPtr(), p.output_tensor->Shape().Size()); \
&func_ctx, p.output_tensor->Shape().Size()); \
\
return Status::OK(); \
}
5 changes: 2 additions & 3 deletions onnxruntime/core/providers/cuda/activation/activations.cc
@@ -23,12 +23,11 @@ namespace cuda {
Status x<T>::ComputeInternal(OpKernelContext* context) const { \
UnaryElementwisePreparation p; \
UnaryElementwise::Prepare(context, &p); \
CudaAsyncBuffer<Ctx##x> func_ctx(this, MakeFuncCtx(), 1); \
if (!std::is_same<CtxNull, Ctx##x>::value) ORT_RETURN_IF_ERROR(func_ctx.CopyToGpu()); \
Ctx##x func_ctx = MakeFuncCtx(); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
func_ctx.GpuPtr(), p.output_tensor->Shape().Size()); \
&func_ctx, p.output_tensor->Shape().Size()); \
\
return Status::OK(); \
}
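Both activation files get the same treatment: the per-op functor context is no longer staged in a CudaAsyncBuffer and copied to the GPU ahead of the launch; it is built on the host with MakeFuncCtx() and handed to the Impl_ launcher, which forwards it to the kernel by value. A minimal sketch of that pattern, using an illustrative context struct and kernel rather than the real ORT types:

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative per-op context: small and trivially copyable, so it can travel
// to the device inside the kernel-argument buffer instead of a staging buffer.
struct CtxAlpha {
  float alpha;
};

// The context arrives by value; no device allocation or explicit copy is needed.
__global__ void ApplyAlpha(const float* in, float* out, const CtxAlpha ctx, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * ctx.alpha;
}

int main() {
  const int n = 256;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = 1.0f;

  CtxAlpha ctx{2.0f};                       // built on the host, like MakeFuncCtx()
  ApplyAlpha<<<1, 256>>>(in, out, ctx, n);  // copied to the device as a launch argument
  cudaDeviceSynchronize();

  printf("out[0] = %f\n", out[0]);          // prints 2.0
  cudaFree(in);
  cudaFree(out);
  return 0;
}

This drops one small device buffer and one asynchronous host-to-device copy per activation launch, which is presumably where the perf win in the title comes from.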
@@ -13,7 +13,7 @@ template <typename InT, typename OutT, typename FuncT, int NumThreadsPerBlock, i
__global__ void _UnaryElementWise(
const InT* input_data,
OutT* output_data,
const FuncT& functor,
const FuncT functor,
CUDA_LONG N) {
CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
InT value[NumElementsPerThread];
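The matching kernel-side change drops the reference: __global__ arguments are marshalled into the launch's parameter buffer, so the functor has to be taken by value; a const FuncT& would carry a host address onto the GPU now that the functor is no longer materialized in device memory. A hedged sketch of the by-value functor pattern (illustrative functor, not one of the ORT Ctx types):

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative device functor carrying its own state by value.
struct ScaleBy {
  float s;
  __device__ float operator()(float x) const { return x * s; }
};

template <typename T, typename FuncT>
__global__ void UnaryElementWise(const T* in, T* out, const FuncT functor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = functor(in[i]);  // reads the copy in kernel parameter space
}

int main() {
  const int n = 128;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = static_cast<float>(i);

  UnaryElementWise<<<1, 128>>>(in, out, ScaleBy{0.5f}, n);
  cudaDeviceSynchronize();
  printf("out[10] = %f\n", out[10]);  // prints 5.0
  cudaFree(in);
  cudaFree(out);
  return 0;
}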
9 changes: 4 additions & 5 deletions onnxruntime/core/providers/cuda/math/topk.cc
@@ -45,16 +45,16 @@ TopK<inputk>::TopK(const OpKernelInfo& info) : CudaKernel(info) {
#define TOPKIMPL(T) TopKImpl<T>(this, tensor_X->Data<T>(), \
static_cast<T*>(tensor_V->MutableDataRaw()), \
static_cast<int64_t*>(tensor_I->MutableDataRaw()), \
elem_nums_cuda.GpuPtr(), \
elem_nums_cuda, \
elem_nums.size(), \
axis, K_, largest_, sorted_, N, dimension)

template <bool inputk>
Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
auto tensor_X = ctx->Input<Tensor>(0);
ORT_ENFORCE(nullptr != tensor_X);
auto rank = static_cast<int64_t>(tensor_X->Shape().NumDimensions());
auto axis = axis_ < 0 ? rank + axis_ : axis_;
int32_t rank = static_cast<int32_t>(tensor_X->Shape().NumDimensions());
int32_t axis = static_cast<int32_t>(axis_ < 0 ? rank + axis_ : axis_);
ORT_ENFORCE(axis > -1 && axis < rank);

if (inputk) {
@@ -80,8 +80,7 @@ Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
}

auto N = elem_nums[0] / dimension;
CudaAsyncBuffer<int64_t> elem_nums_cuda(this, elem_nums);
ORT_RETURN_IF_ERROR(elem_nums_cuda.CopyToGpu());
TArray<int64_t> elem_nums_cuda(elem_nums);

auto prim_type = tensor_X->DataType()->AsPrimitiveDataType();
if (prim_type == nullptr) {
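TArray is the heart of the PR: a small, fixed-capacity, trivially copyable array that is passed to kernels as an ordinary launch argument, so the per-axis element counts arrive in kernel parameter space instead of needing a device buffer plus an asynchronous staging copy (which is what CudaAsyncBuffer provided). A minimal sketch of the idea; the capacity, field names, and constructor here are assumptions, not ORT's actual TArray:

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of a fixed-capacity array that stores its elements inline. With no
// heap pointer inside, it stays trivially copyable and can be a kernel argument;
// the capacity must stay small because kernel parameter space is limited.
template <typename T, int Capacity = 8>
struct SmallArray {
  T data_[Capacity];
  int32_t size_ = 0;

  SmallArray() = default;
  explicit SmallArray(const std::vector<T>& v) : size_(static_cast<int32_t>(v.size())) {
    for (int32_t i = 0; i < size_; ++i) data_[i] = v[i];
  }
  __host__ __device__ T& operator[](int32_t i) { return data_[i]; }
  __host__ __device__ const T& operator[](int32_t i) const { return data_[i]; }
};

__global__ void SumDims(const SmallArray<int64_t> dims, int64_t* out) {
  int64_t s = 0;
  for (int32_t i = 0; i < dims.size_; ++i) s += dims[i];
  *out = s;
}

int main() {
  SmallArray<int64_t> dims(std::vector<int64_t>{2, 3, 4});  // like elem_nums_cuda above
  int64_t* out;
  cudaMallocManaged(&out, sizeof(int64_t));
  SumDims<<<1, 1>>>(dims, out);
  cudaDeviceSynchronize();
  printf("sum = %lld\n", static_cast<long long>(*out));  // prints 9
  cudaFree(out);
  return 0;
}

Note the split visible in topk_impl.cu below: the host-side TopKImpl takes the TArray by const reference, while the __global__ kernels take it by value, so the device copy happens implicitly at each launch and the kernels read elem_nums from parameter space rather than through a pointer into global memory.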
14 changes: 7 additions & 7 deletions onnxruntime/core/providers/cuda/math/topk_impl.cu
@@ -32,7 +32,7 @@ struct KV {
#define LESS(n, m) ((n) <= (m) ? (n) : (m))

template <typename T>
__global__ void BitonicTopK(const T* X, T* V, int64_t* I, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t aligned_K, int64_t largest, int64_t sorted, int64_t dimension, int64_t aligned_dimension, T type_min, T type_max) {
__global__ void BitonicTopK(const T* X, T* V, int64_t* I, const TArray<int64_t> elem_nums, size_t size, int32_t axis, int64_t K, int64_t aligned_K, int64_t largest, int64_t sorted, int64_t dimension, int64_t aligned_dimension, T type_min, T type_max) {
auto tid = threadIdx.x;
auto bid = blockIdx.x;
extern __shared__ char shared_mem[];
@@ -192,7 +192,7 @@ __device__ void SetByte(double* d, int64_t byte) {
}

template<typename T, int64_t THREADS, int64_t KPT>
__global__ void RadixTopK(const T* X, T* V, int64_t* I, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t dimension, int64_t XPT, T type_min, T type_max) {
__global__ void RadixTopK(const T* X, T* V, int64_t* I, const TArray<int64_t> elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t dimension, int64_t XPT, T type_min, T type_max) {
auto tid = threadIdx.x;
auto bid = blockIdx.x;
extern __shared__ char shared_mem[];
@@ -342,7 +342,7 @@ __global__ void RadixTopK(const T* X, T* V, int64_t* I, const int64_t* elem_nums
}

template <typename T>
__global__ void FillInput(const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t offset, int64_t dimension) {
__global__ void FillInput(const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t> elem_nums, size_t size, int32_t axis, int64_t K, int64_t offset, int64_t dimension) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, dimension);
auto left = offset / (axis == size - 1 ? 1 : elem_nums[axis + 1]) * elem_nums[axis];
auto right = axis == size - 1 ? 0 : offset % elem_nums[axis + 1];
@@ -352,7 +352,7 @@ __global__ void FillInput(const T* input_x, T* output_v, int64_t* output_i, cons
}

template <typename T>
__global__ void FillOutput(const T* input_v, const int64_t* input_i, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t offset, int64_t dimension) {
__global__ void FillOutput(const T* input_v, const int64_t* input_i, T* output_v, int64_t* output_i, const TArray<int64_t> elem_nums, size_t size, int32_t axis, int64_t K, int64_t offset, int64_t dimension) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, K);
auto left = offset / (axis == size - 1 ? 1 : elem_nums[axis + 1]) * elem_nums[axis] * K / dimension;
auto right = axis == size - 1 ? 0 : offset % elem_nums[axis + 1];
@@ -369,7 +369,7 @@ __global__ void ExcludeOutput(int64_t* output_i, int64_t K, int64_t dimension) {
}

template <typename T>
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
auto aligned_K = ALIGN(K);
auto aligned_dimension = ALIGN(dimension);
if (aligned_dimension <= GridDim::maxThreadsPerBlock) {
@@ -419,9 +419,9 @@ Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t
const T* input_x, \
T* output_v, \
int64_t* output_i, \
const int64_t* elem_nums, \
const TArray<int64_t>& elem_nums, \
size_t size, \
int64_t axis, \
int32_t axis, \
int64_t K, \
int64_t largest, \
int64_t sorted, \
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cuda/math/topk_impl.h
@@ -11,7 +11,7 @@ namespace onnxruntime {
namespace cuda {

template <typename T>
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);

} // namespace cuda
} // namespace onnxruntime
20 changes: 12 additions & 8 deletions onnxruntime/core/providers/cuda/tensor/expand.cc
@@ -84,13 +84,17 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
CalcEffectiveDims(input_dims, output_dims);
int rank = gsl::narrow_cast<int>(output_dims.size());

CudaAsyncBuffer<fast_divmod> fdm_output_strides(this, rank);
ORT_ENFORCE(CalculateFdmStrides(fdm_output_strides.CpuSpan(), output_dims));
TensorPitches original_input_strides(input_dims);
TensorPitches original_output_strides(output_dims);

CudaAsyncBuffer<int64_t> input_view_strides(this, rank);
TensorPitches::Calculate(input_view_strides.CpuSpan(), input_dims);
for (int i = 0; i < rank; ++i) {
if (input_dims[i] == 1) input_view_strides.CpuSpan()[i] = 0;
TArray<int64_t> input_strides(rank);
for (auto i = 0; i < rank; i++) {
input_strides[i] = input_dims[i] == 1 ? 0 : original_input_strides[i];
}

TArray<fast_divmod> output_strides(rank);
for (auto i = 0; i < rank; i++) {
output_strides[i] = fast_divmod(static_cast<int>(original_output_strides[i]));
}

return ExpandImpl(
@@ -99,8 +103,8 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
gsl::narrow_cast<int>(input_data_tensor.Shape().Size()),
input_data_tensor.DataRaw(),
output_tensor.MutableDataRaw(),
fdm_output_strides,
input_view_strides);
output_strides,
input_strides);
}


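The rewritten Expand::ComputeInternal precomputes two small arrays: input strides with a zero on every broadcast dimension, and fast_divmod output strides used to unflatten the flat output index inside the kernel. The mapping the kernel performs can be mimicked on the host with plain division; a sketch with hypothetical shapes, where ordinary / and % stand in for fast_divmod (which does the same thing with precomputed constants to avoid hardware division):

#include <cstdint>
#include <cstdio>
#include <vector>

// Row-major strides, equivalent to what TensorPitches computes.
std::vector<int64_t> Strides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> s(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i)
    s[i] = s[i + 1] * dims[i + 1];
  return s;
}

int main() {
  // Hypothetical Expand: input {3, 1} broadcast to output {3, 4}.
  std::vector<int64_t> input_dims{3, 1}, output_dims{3, 4};
  std::vector<int64_t> output_strides = Strides(output_dims);
  std::vector<int64_t> input_strides = Strides(input_dims);

  // Zero the stride of broadcast dimensions so they do not advance the input index.
  for (size_t i = 0; i < input_dims.size(); ++i)
    if (input_dims[i] == 1) input_strides[i] = 0;

  for (int64_t id = 0; id < 12; ++id) {     // flat output index, like the kernel's id
    int64_t r = id, input_index = 0;
    for (size_t i = 0; i < output_strides.size(); ++i) {
      int64_t dim = r / output_strides[i];  // output coordinate along axis i
      r %= output_strides[i];
      input_index += dim * input_strides[i];
    }
    printf("output %2lld <- input %lld\n",
           static_cast<long long>(id), static_cast<long long>(input_index));
  }
  return 0;
}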
24 changes: 11 additions & 13 deletions onnxruntime/core/providers/cuda/tensor/expand_impl.cu
@@ -50,14 +50,14 @@ __global__ void ExpandKernel(
const int N,
const T* input_data,
T* output_data,
const fast_divmod* fdm_output_strides,
const int64_t* input_view_strides) {
const TArray<fast_divmod> output_strides,
const TArray<int64_t> input_strides) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);

int dim, r = id, input_index = 0;
for (int i = 0; i < rank; ++i) {
fdm_output_strides[i].divmod(r, dim, r);
input_index += dim * input_view_strides[i];
output_strides[i].divmod(r, dim, r);
input_index += dim * input_strides[i];
}
output_data[id] = input_data[input_index];
}
@@ -114,9 +114,9 @@ Status ExpandImpl(
const int N_input,
const void* input_data,
void* output_data,
CudaKernel::CudaAsyncBuffer<fast_divmod>& fdm_output_strides,
CudaKernel::CudaAsyncBuffer<int64_t>& input_view_strides) {
const int rank = static_cast<int>(fdm_output_strides.count());
const TArray<fast_divmod>& output_strides,
const TArray<int64_t>& input_strides) {
const int rank = static_cast<int>(output_strides.size_);
if (rank == 1) {
if (N_input == N_output) {
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, N_output * element_size, cudaMemcpyDeviceToDevice));
@@ -125,20 +125,18 @@
}
} else if (rank == 2) {
return Expand2D(element_size, N_output, input_data, output_data,
fdm_output_strides.CpuSpan()[0],
static_cast<int>(input_view_strides.CpuSpan()[0]),
static_cast<int>(input_view_strides.CpuSpan()[1]));
output_strides[0],
static_cast<int>(input_strides[0]),
static_cast<int>(input_strides[1]));
}

int blocksPerGrid = gsl::narrow_cast<int>(CeilDiv(N_output, GridDim::maxThreadsPerBlock));
fdm_output_strides.CopyToGpu();
input_view_strides.CopyToGpu();

#define EXPAND_ON(TYPE) \
case sizeof(TYPE): \
ExpandKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>( \
rank, N_output, reinterpret_cast<const TYPE*>(input_data), reinterpret_cast<TYPE*>(output_data), \
fdm_output_strides.GpuPtr(), input_view_strides.GpuPtr()); \
output_strides, input_strides); \
break

switch (element_size) {
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/tensor/expand_impl.h
@@ -17,8 +17,8 @@ Status ExpandImpl(
const int N_input,
const void* input_data,
void* output_data,
CudaKernel::CudaAsyncBuffer<fast_divmod>& fdm_output_strides,
CudaKernel::CudaAsyncBuffer<int64_t>& input_view_strides);
const TArray<fast_divmod>& output_strides,
const TArray<int64_t>& input_strides);


} // namespace cuda
22 changes: 11 additions & 11 deletions onnxruntime/core/providers/cuda/tensor/gather_elements.cc
@@ -32,7 +32,7 @@ Status GatherElements::ComputeInternal(OpKernelContext* context) const {
const auto* indices_tensor = context->Input<Tensor>(1);
const auto& indices_shape = indices_tensor->Shape();
const auto& indices_dims = indices_shape.GetDims();
const int64_t indices_rank = static_cast<int64_t>(indices_dims.size());
const int32_t indices_rank = static_cast<int32_t>(indices_dims.size());
const int64_t indices_size = indices_shape.Size();

// Handle negative axis if any
@@ -51,13 +51,13 @@
return Status::OK();

TensorPitches input_strides(input_dims);
CudaAsyncBuffer<int64_t> gpu_input_strides(this, input_strides);
TArray<int64_t> gpu_input_strides(input_strides);

CudaAsyncBuffer<fast_divmod> fdm_indices_strides(this, indices_rank);
ORT_ENFORCE(CalculateFdmStrides(fdm_indices_strides.CpuSpan(), indices_dims));

ORT_RETURN_IF_ERROR(gpu_input_strides.CopyToGpu());
ORT_RETURN_IF_ERROR(fdm_indices_strides.CopyToGpu());
TArray<fast_divmod> fdm_indices_strides(indices_rank);
TensorPitches indices_strides(indices_dims);
for (auto i = 0; i < indices_rank; i++) {
fdm_indices_strides[i] = fast_divmod(static_cast<int>(indices_strides[i]));
}

size_t element_size = input_tensor->DataType()->Size();

@@ -67,10 +67,10 @@
input_rank,
input_tensor->DataRaw(),
input_dims[axis],
gpu_input_strides.GpuPtr(),
gpu_input_strides,
indices_data,
indices_size,
fdm_indices_strides.GpuPtr(),
fdm_indices_strides,
axis,
output_tensor->MutableDataRaw(),
element_size);
@@ -81,10 +81,10 @@
input_rank,
input_tensor->DataRaw(),
input_dims[axis],
gpu_input_strides.GpuPtr(),
gpu_input_strides,
indices_data,
indices_size,
fdm_indices_strides.GpuPtr(),
fdm_indices_strides,
axis,
output_tensor->MutableDataRaw(),
element_size);
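GatherElements follows the same recipe: fdm_indices_strides unflattens a flat position in the indices tensor into coordinates, the coordinate along axis is replaced by the gathered (possibly negative) index, and input_strides re-flattens the result into an offset into the input. A host-side sketch of that mapping with an illustrative 2-D example, plain division standing in for fast_divmod:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical GatherElements with axis = 0: input shape {3, 2}, indices shape {2, 2}.
  std::vector<int64_t> input = {10, 11, 20, 21, 30, 31};  // 3 x 2, row-major
  std::vector<int64_t> indices = {2, 0, 1, 2};            // 2 x 2
  std::vector<int64_t> input_strides = {2, 1};            // like TensorPitches(input_dims)
  std::vector<int64_t> indices_strides = {2, 1};          // fdm_indices_strides on the GPU
  const int64_t axis = 0, input_dim_along_axis = 3;

  std::vector<int64_t> output(indices.size());
  for (size_t flat = 0; flat < indices.size(); ++flat) {
    int64_t r = static_cast<int64_t>(flat), input_offset = 0;
    for (size_t d = 0; d < indices_strides.size(); ++d) {
      int64_t coord = r / indices_strides[d];        // unflatten the indices position
      r %= indices_strides[d];
      if (static_cast<int64_t>(d) == axis) {
        int64_t idx = indices[flat];
        if (idx < 0) idx += input_dim_along_axis;    // wrap negative indices
        coord = idx;                                 // substitute the gathered index
      }
      input_offset += coord * input_strides[d];
    }
    output[flat] = input[input_offset];
  }

  for (int64_t v : output) printf("%lld ", static_cast<long long>(v));  // 30 11 20 31
  printf("\n");
  return 0;
}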
16 changes: 8 additions & 8 deletions onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
@@ -12,10 +12,10 @@ __global__ void _GatherElementsKernel(
const int64_t rank,
const T* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t> input_strides,
const Tin* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod> indices_strides,
const int64_t axis,
T* output_data) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(indices_index, indices_size);
@@ -43,10 +43,10 @@ void GatherElementsImpl(
const int64_t rank,
const void* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t>& input_strides,
const Tin* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod>& indices_strides,
const int64_t axis,
void* output_data,
size_t element_size) {
@@ -95,10 +95,10 @@ template void GatherElementsImpl<int32_t>(
const int64_t rank,
const void* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t>& input_strides,
const int32_t* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod>& indices_strides,
const int64_t axis,
void* output_data,
size_t element_size);
@@ -107,10 +107,10 @@ template void GatherElementsImpl<int64_t>(
const int64_t rank,
const void* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t>& input_strides,
const int64_t* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod>& indices_strides,
const int64_t axis,
void* output_data,
size_t element_size);
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h
@@ -14,10 +14,10 @@ void GatherElementsImpl(
const int64_t rank, // both inputs have same rank and this is validated in the main Compute
const void* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t>& input_strides,
const Tin* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod>& indices_strides,
const int64_t axis,
void* output_data,
size_t element_size);