Skip to content

Commit

Permalink
static build option - statistical_operations enabled (ROCm#52)
Browse files Browse the repository at this point in the history
* Mods for max/min/thresholding

* Merge branch 'ar/master' into ar/hip_stat_opt

* Remove fork CI
  • Loading branch information
r-abishek authored Apr 28, 2021
1 parent 0416bde commit 1ddd6a1
Show file tree
Hide file tree
Showing 5 changed files with 310 additions and 122 deletions.
32 changes: 30 additions & 2 deletions src/modules/hip/hip_statistical_operations.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "hip_declarations.hpp"
#include "kernel/rpp_hip_host_decls.hpp"

/******************** thresholding ********************/

Expand Down Expand Up @@ -29,6 +30,9 @@ thresholding_hip_batch(Rpp8u* srcPtr, Rpp8u* dstPtr, rpp::Handle& handle, RppiCh
plnpkdind = 3;
Rpp32u max_height, max_width;
max_size(handle.GetInitHandle()->mem.mgpu.csrcSize.height, handle.GetInitHandle()->mem.mgpu.csrcSize.width, handle.GetBatchSize(), &max_height, &max_width);

#if defined (HIPRTC)

std::vector<size_t> vld{32, 32, 1};
std::vector<size_t> vgd{(max_width + 31) & ~31, (max_height + 31) & ~31, handle.GetBatchSize()};

Expand All @@ -48,13 +52,19 @@ thresholding_hip_batch(Rpp8u* srcPtr, Rpp8u* dstPtr, rpp::Handle& handle, RppiCh
handle.GetInitHandle()->mem.mgpu.inc,
plnpkdind);

#elif defined(STATIC)

hip_exec_thresholding_batch(srcPtr, dstPtr, handle, chnFormat, channel, plnpkdind, max_height, max_width);

#endif

return RPP_SUCCESS;
}

/******************** min ********************/

RppStatus
min_hip(Rpp8u* srcPtr1,Rpp8u* srcPtr2, RppiSize srcSize, Rpp8u* dstPtr, RppiChnFormat chnFormat, unsigned int channel, rpp::Handle& handle)
min_hip(Rpp8u* srcPtr1, Rpp8u* srcPtr2, RppiSize srcSize, Rpp8u* dstPtr, RppiChnFormat chnFormat, unsigned int channel, rpp::Handle& handle)
{
std::vector<size_t> vld{32, 32, 1};
std::vector<size_t> vgd{(srcSize.width + 31) & ~31, (srcSize.height + 31) & ~31, channel};
Expand All @@ -70,7 +80,7 @@ min_hip(Rpp8u* srcPtr1,Rpp8u* srcPtr2, RppiSize srcSize, Rpp8u* dstPtr, RppiChnF
}

RppStatus
min_hip_batch(Rpp8u* srcPtr1,Rpp8u* srcPtr2, Rpp8u* dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, unsigned int channel)
min_hip_batch(Rpp8u* srcPtr1, Rpp8u* srcPtr2, Rpp8u* dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, unsigned int channel)
{
int plnpkdind;
if(chnFormat == RPPI_CHN_PLANAR)
Expand All @@ -79,6 +89,9 @@ min_hip_batch(Rpp8u* srcPtr1,Rpp8u* srcPtr2, Rpp8u* dstPtr, rpp::Handle& handle,
plnpkdind = 3;
Rpp32u max_height, max_width;
max_size(handle.GetInitHandle()->mem.mgpu.csrcSize.height, handle.GetInitHandle()->mem.mgpu.csrcSize.width, handle.GetBatchSize(), &max_height, &max_width);

#if defined (HIPRTC)

std::vector<size_t> vld{32, 32, 1};
std::vector<size_t> vgd{(max_width + 31) & ~31, (max_height + 31) & ~31, handle.GetBatchSize()};

Expand All @@ -97,6 +110,12 @@ min_hip_batch(Rpp8u* srcPtr1,Rpp8u* srcPtr2, Rpp8u* dstPtr, rpp::Handle& handle,
handle.GetInitHandle()->mem.mgpu.inc,
plnpkdind);

#elif defined(STATIC)

hip_exec_min_batch(srcPtr1, srcPtr2, dstPtr, handle, chnFormat, channel, plnpkdind, max_height, max_width);

#endif

return RPP_SUCCESS;
}

Expand Down Expand Up @@ -128,6 +147,9 @@ max_hip_batch(Rpp8u* srcPtr1,Rpp8u* srcPtr2, Rpp8u* dstPtr, rpp::Handle& handle,
plnpkdind = 3;
Rpp32u max_height, max_width;
max_size(handle.GetInitHandle()->mem.mgpu.csrcSize.height, handle.GetInitHandle()->mem.mgpu.csrcSize.width, handle.GetBatchSize(), &max_height, &max_width);

#if defined (HIPRTC)

std::vector<size_t> vld{32, 32, 1};
std::vector<size_t> vgd{(max_width + 31) & ~31, (max_height + 31) & ~31, handle.GetBatchSize()};

Expand All @@ -146,6 +168,12 @@ max_hip_batch(Rpp8u* srcPtr1,Rpp8u* srcPtr2, Rpp8u* dstPtr, rpp::Handle& handle,
handle.GetInitHandle()->mem.mgpu.inc,
plnpkdind);

#elif defined(STATIC)

hip_exec_max_batch(srcPtr1, srcPtr2, dstPtr, handle, chnFormat, channel, plnpkdind, max_height, max_width);

#endif

return RPP_SUCCESS;
}

Expand Down
130 changes: 91 additions & 39 deletions src/modules/hip/kernel/max.cpp
Original file line number Diff line number Diff line change
@@ -1,61 +1,113 @@
#include <hip/hip_runtime.h>
#define saturate_8u(value) ( (value) > 255 ? 255 : ((value) < 0 ? 0 : (value) ))
extern "C" __global__ void max_hip( unsigned char* input1,
unsigned char* input2,
unsigned char* output,
const unsigned int height,
const unsigned int width,
const unsigned int channel
)

#if defined(STATIC)
#include "rpp_hip_host_decls.hpp"
#endif

#define saturate_8u(value) ((value) > 255 ? 255 : ((value) < 0 ? 0 : (value)))

__device__ unsigned char max_formula(unsigned char input_pixel1, unsigned char input_pixel2)
{
int id_x = hipBlockIdx_x *hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y *hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z *hipBlockDim_z + hipThreadIdx_z;
if (id_x >= width || id_y >= height || id_z >= channel) return;
return saturate_8u((input_pixel1 >= input_pixel2) ? input_pixel1 : input_pixel2);
}

extern "C" __global__ void max_hip(unsigned char *input1,
unsigned char *input2,
unsigned char *output,
const unsigned int height,
const unsigned int width,
const unsigned int channel)
{
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if (id_x >= width || id_y >= height || id_z >= channel)
{
return;
}

int pixIdx = id_x + id_y * width + id_z * width * height;

output[pixIdx] = (input1[pixIdx] > input2[pixIdx]) ? input2[pixIdx] : input1[pixIdx];
}

__device__ unsigned char max_formula( unsigned char input_pixel1, unsigned char input_pixel2){
return saturate_8u((input_pixel1 >= input_pixel2) ? input_pixel1 : input_pixel2);
}
extern "C" __global__ void max_batch( unsigned char* input1,
unsigned char* input2,
unsigned char* output,
int *xroi_begin,
int *xroi_end,
int *yroi_begin,
int *yroi_end,

extern "C" __global__ void max_batch(unsigned char *input1,
unsigned char *input2,
unsigned char *output,
unsigned int *xroi_begin,
unsigned int *xroi_end,
unsigned int *yroi_begin,
unsigned int *yroi_end,
unsigned int *height,
unsigned int *width,
unsigned int *max_width,
unsigned long *batch_index,
const unsigned int channel,
unsigned int *inc, // use width * height for pln and 1 for pkd
const int plnpkdindex // use 1 pln 3 for pkd
)
unsigned long long *batch_index,
const unsigned int channel,
unsigned int *inc, // use width * height for pln and 1 for pkd
const int plnpkdindex) // use 1 pln 3 for pkd
{
int id_x = hipBlockIdx_x *hipBlockDim_x + hipThreadIdx_x, id_y = hipBlockIdx_y *hipBlockDim_y + hipThreadIdx_y, id_z = hipBlockIdx_z *hipBlockDim_z + hipThreadIdx_z;
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

unsigned char valuergb1, valuergb2;
int indextmp=0;
int indextmp = 0;
long pixIdx = 0;

pixIdx = batch_index[id_z] + (id_x + id_y * max_width[id_z] ) * plnpkdindex ;
if((id_y >= yroi_begin[id_z] ) && (id_y <= yroi_end[id_z]) && (id_x >= xroi_begin[id_z]) && (id_x <= xroi_end[id_z]))
{
for(indextmp = 0; indextmp < channel; indextmp++){
pixIdx = batch_index[id_z] + (id_x + id_y * max_width[id_z]) * plnpkdindex;

if((id_y >= yroi_begin[id_z]) && (id_y <= yroi_end[id_z]) && (id_x >= xroi_begin[id_z]) && (id_x <= xroi_end[id_z]))
{
for(indextmp = 0; indextmp < channel; indextmp++)
{
valuergb1 = input1[pixIdx];
valuergb2 = input2[pixIdx];
output[pixIdx] = max_formula(valuergb1, valuergb2);
pixIdx += inc[id_z];
}
}
else if((id_x < width[id_z] ) && (id_y < height[id_z])){
for(indextmp = 0; indextmp < channel; indextmp++){
output[pixIdx] = input1[pixIdx];
pixIdx += inc[id_z];
}
else if((id_x < width[id_z]) && (id_y < height[id_z]))
{
for(indextmp = 0; indextmp < channel; indextmp++)
{
output[pixIdx] = input1[pixIdx];
pixIdx += inc[id_z];
}
}
}
}

#if defined(STATIC)
RppStatus hip_exec_max_batch(Rpp8u *srcPtr1, Rpp8u *srcPtr2, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width)
{
int localThreads_x = 32;
int localThreads_y = 32;
int localThreads_z = 1;
int globalThreads_x = (max_width + 31) & ~31;
int globalThreads_y = (max_height + 31) & ~31;
int globalThreads_z = handle.GetBatchSize();

hipLaunchKernelGGL(max_batch,
dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
dim3(localThreads_x, localThreads_y, localThreads_z),
0,
handle.GetStream(),
srcPtr1,
srcPtr2,
dstPtr,
handle.GetInitHandle()->mem.mgpu.roiPoints.x,
handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth,
handle.GetInitHandle()->mem.mgpu.roiPoints.y,
handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight,
handle.GetInitHandle()->mem.mgpu.srcSize.height,
handle.GetInitHandle()->mem.mgpu.srcSize.width,
handle.GetInitHandle()->mem.mgpu.maxSrcSize.width,
handle.GetInitHandle()->mem.mgpu.srcBatchIndex,
channel,
handle.GetInitHandle()->mem.mgpu.inc,
plnpkdind);

return RPP_SUCCESS;
}
#endif
127 changes: 89 additions & 38 deletions src/modules/hip/kernel/min.cpp
Original file line number Diff line number Diff line change
@@ -1,61 +1,112 @@
#include <hip/hip_runtime.h>
#define saturate_8u(value) ( (value) > 255 ? 255 : ((value) < 0 ? 0 : (value) ))
extern "C" __global__ void min_hip( unsigned char* input1,
unsigned char* input2,
unsigned char* output,
const unsigned int height,
const unsigned int width,
const unsigned int channel
)

#if defined(STATIC)
#include "rpp_hip_host_decls.hpp"
#endif

#define saturate_8u(value) ((value) > 255 ? 255 : ((value) < 0 ? 0 : (value)))

__device__ unsigned char min_formula(unsigned char input_pixel1, unsigned char input_pixel2)
{
int id_x = hipBlockIdx_x *hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y *hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z *hipBlockDim_z + hipThreadIdx_z;
if (id_x >= width || id_y >= height || id_z >= channel) return;
return saturate_8u((input_pixel1 <= input_pixel2) ? input_pixel1 : input_pixel2);
}

extern "C" __global__ void min_hip(unsigned char *input1,
unsigned char *input2,
unsigned char *output,
const unsigned int height,
const unsigned int width,
const unsigned int channel)
{
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if (id_x >= width || id_y >= height || id_z >= channel)
{
return;
}

int pixIdx = id_x + id_y * width + id_z * width * height;

output[pixIdx] = (input1[pixIdx] > input2[pixIdx]) ? input2[pixIdx] : input1[pixIdx];
}

__device__ unsigned char min_formula( unsigned char input_pixel1, unsigned char input_pixel2){
return saturate_8u((input_pixel1 <= input_pixel2) ? input_pixel1 : input_pixel2);
}
extern "C" __global__ void min_batch( unsigned char* input1,
unsigned char* input2,
unsigned char* output,
int *xroi_begin,
int *xroi_end,
int *yroi_begin,
int *yroi_end,
extern "C" __global__ void min_batch(unsigned char *input1,
unsigned char *input2,
unsigned char *output,
unsigned int *xroi_begin,
unsigned int *xroi_end,
unsigned int *yroi_begin,
unsigned int *yroi_end,
unsigned int *height,
unsigned int *width,
unsigned int *max_width,
unsigned long *batch_index,
const unsigned int channel,
unsigned int *inc, // use width * height for pln and 1 for pkd
const int plnpkdindex // use 1 pln 3 for pkd
)
unsigned long long *batch_index,
const unsigned int channel,
unsigned int *inc, // use width * height for pln and 1 for pkd
const int plnpkdindex) // use 1 pln 3 for pkd
{
int id_x = hipBlockIdx_x *hipBlockDim_x + hipThreadIdx_x, id_y = hipBlockIdx_y *hipBlockDim_y + hipThreadIdx_y, id_z = hipBlockIdx_z *hipBlockDim_z + hipThreadIdx_z;
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

unsigned char valuergb1, valuergb2;
int indextmp=0;
int indextmp = 0;
long pixIdx = 0;

pixIdx = batch_index[id_z] + (id_x + id_y * max_width[id_z] ) * plnpkdindex ;
pixIdx = batch_index[id_z] + (id_x + id_y * max_width[id_z]) * plnpkdindex;

if((id_y >= yroi_begin[id_z] ) && (id_y <= yroi_end[id_z]) && (id_x >= xroi_begin[id_z]) && (id_x <= xroi_end[id_z]))
{
for(indextmp = 0; indextmp < channel; indextmp++){
{
for(indextmp = 0; indextmp < channel; indextmp++)
{
valuergb1 = input1[pixIdx];
valuergb2 = input2[pixIdx];
output[pixIdx] = min_formula(valuergb1, valuergb2);
pixIdx += inc[id_z];
}
}
else if((id_x < width[id_z] ) && (id_y < height[id_z])){
for(indextmp = 0; indextmp < channel; indextmp++){
output[pixIdx] = input1[pixIdx];
pixIdx += inc[id_z];
}
else if((id_x < width[id_z]) && (id_y < height[id_z]))
{
for(indextmp = 0; indextmp < channel; indextmp++)
{
output[pixIdx] = input1[pixIdx];
pixIdx += inc[id_z];
}
}
}
}

#if defined(STATIC)
RppStatus hip_exec_min_batch(Rpp8u *srcPtr1, Rpp8u *srcPtr2, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width)
{
int localThreads_x = 32;
int localThreads_y = 32;
int localThreads_z = 1;
int globalThreads_x = (max_width + 31) & ~31;
int globalThreads_y = (max_height + 31) & ~31;
int globalThreads_z = handle.GetBatchSize();

hipLaunchKernelGGL(min_batch,
dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
dim3(localThreads_x, localThreads_y, localThreads_z),
0,
handle.GetStream(),
srcPtr1,
srcPtr2,
dstPtr,
handle.GetInitHandle()->mem.mgpu.roiPoints.x,
handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth,
handle.GetInitHandle()->mem.mgpu.roiPoints.y,
handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight,
handle.GetInitHandle()->mem.mgpu.srcSize.height,
handle.GetInitHandle()->mem.mgpu.srcSize.width,
handle.GetInitHandle()->mem.mgpu.maxSrcSize.width,
handle.GetInitHandle()->mem.mgpu.srcBatchIndex,
channel,
handle.GetInitHandle()->mem.mgpu.inc,
plnpkdind);

return RPP_SUCCESS;
}
#endif
Loading

0 comments on commit 1ddd6a1

Please sign in to comment.