Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized division for CN-Heavy #185

Merged
merged 1 commit into from
Nov 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/amd/OclGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, xmrig::Config *config)
const char *fastIntMathV2CL =
#include "./opencl/fast_int_math_v2.cl"
;
const char *fastDivHeavyCL =
#include "./opencl/fast_div_heavy.cl"
;

std::string source_code(cryptonightCL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_AES"), wolfAesCL);
Expand All @@ -445,6 +448,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, xmrig::Config *config)
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_BLAKE256"), blake256CL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_GROESTL256"), groestl256CL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);

for (size_t i = 0; i < num_gpus; ++i) {
if (ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) {
Expand Down
6 changes: 4 additions & 2 deletions src/amd/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ XMRIG_INCLUDE_BLAKE256
XMRIG_INCLUDE_GROESTL256
//#include "fast_int_math_v2.cl"
XMRIG_INCLUDE_FAST_INT_MATH_V2
//#include "fast_div_heavy.cl"
XMRIG_INCLUDE_FAST_DIV_HEAVY


#define VARIANT_0 0 // Original CryptoNight or CryptoNight-Heavy
Expand Down Expand Up @@ -922,7 +924,7 @@ __kernel void cn1_tube(__global uint4 *Scratchpad, __global ulong *states, ulong
{
long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
long q = n / (d | 0x5);
long q = fast_div_heavy(n, d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
idx0 = d ^ q;
}
Expand Down Expand Up @@ -1011,7 +1013,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
{
long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
long q = n / (d | 0x5);
long q = fast_div_heavy(n, d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;

if (variant == VARIANT_XHV) {
Expand Down
52 changes: 52 additions & 0 deletions src/amd/opencl/fast_div_heavy.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
R"===(
#ifndef FAST_DIV_HEAVY_CL
#define FAST_DIV_HEAVY_CL

inline ulong get_reciprocal_heavy(uint a)
{
const uint shift = clz(a);
a <<= shift;

const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23));
const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256);

const float r = native_recip(a_hi);

const uint tmp0 = as_uint(r);
const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23);
const float r_scaled = as_float(tmp1);

const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));

const float r_scaled_hi = as_float(tmp1 & ~4095U);
const float h_hi = as_float(as_uint(h) & ~4095U);

const float r_scaled_lo = r_scaled - r_scaled_hi;
const float h_lo = h - h_hi;

const float x1 = h_hi * r_scaled_hi;
const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo;

const long h1 = convert_long_rte(x1);
const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2));

const ulong result = tmp0 & 0xFFFFFF;
return (result << (shift + 9)) - ((h1 + h2) >> 2);
}

inline long fast_div_heavy(long _a, int _b)
{
const ulong a = abs(_a);
const uint b = abs(_b);
ulong q = mul_hi(a, get_reciprocal_heavy(b));

const long tmp = a - q * b;
const int overshoot = (tmp < 0) ? 1 : 0;
const int undershoot = (tmp >= b) ? 1 : 0;
q += undershoot - overshoot;

return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
}

#endif
)==="