xmrig · xmrig · Nov 6, 2018 · Nov 6, 2018
diff --git a/src/amd/OclGPU.cpp b/src/amd/OclGPU.cpp
@@ -437,6 +437,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, xmrig::Config *config)
     const char *fastIntMathV2CL =
         #include "./opencl/fast_int_math_v2.cl"
     ;
+    const char *fastDivHeavyCL =
+        #include "./opencl/fast_div_heavy.cl"
+    ;
 
     std::string source_code(cryptonightCL);
     source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_AES"),         wolfAesCL);
@@ -445,6 +448,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, xmrig::Config *config)
     source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_BLAKE256"),         blake256CL);
     source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_GROESTL256"),       groestl256CL);
     source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
+    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
 
     for (size_t i = 0; i < num_gpus; ++i) {
         if (ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) {

diff --git a/src/amd/opencl/cryptonight.cl b/src/amd/opencl/cryptonight.cl
@@ -90,6 +90,8 @@ XMRIG_INCLUDE_BLAKE256
 XMRIG_INCLUDE_GROESTL256
 //#include "fast_int_math_v2.cl"
 XMRIG_INCLUDE_FAST_INT_MATH_V2
+//#include "fast_div_heavy.cl"
+XMRIG_INCLUDE_FAST_DIV_HEAVY
 
 
 #define VARIANT_0    0  // Original CryptoNight or CryptoNight-Heavy
@@ -922,7 +924,7 @@ __kernel void cn1_tube(__global uint4 *Scratchpad, __global ulong *states, ulong
             {
                 long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
                 int d  = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
-                long q = n / (d | 0x5);
+                long q = fast_div_heavy(n, d | 0x5);
                 *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
                 idx0 = d ^ q;
             }
@@ -1011,7 +1013,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
             {
                 long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
                 int d  = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
-                long q = n / (d | 0x5);
+                long q = fast_div_heavy(n, d | 0x5);
                 *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
 
                 if (variant == VARIANT_XHV) {

diff --git a/src/amd/opencl/fast_div_heavy.cl b/src/amd/opencl/fast_div_heavy.cl
@@ -0,0 +1,52 @@
+R"===(
+#ifndef FAST_DIV_HEAVY_CL
+#define FAST_DIV_HEAVY_CL
+
+inline ulong get_reciprocal_heavy(uint a) // Builds a 64-bit fixed-point multiplier from `a`; fast_div_heavy passes it to mul_hi() as the reciprocal of its divisor.
+{
+	const uint shift = clz(a); // number of leading zero bits in a
+	a <<= shift; // normalize: for non-zero a the top bit is now set
+
+	const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23)); // float bit pattern whose value is ((a >> 8) + 1) * 2^8: top 24 bits of a, bumped up by one unit
+	const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256); // low 8 bits minus 256, so a_hi + a_lo equals the normalized a
+
+	const float r = native_recip(a_hi); // approximate 1 / a_hi
+
+	const uint tmp0 = as_uint(r); // raw bits of r
+	const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23); // add shift + 66 to the exponent field
+	const float r_scaled = as_float(tmp1); // i.e. r scaled by 2^(shift + 66)
+
+	const float h = fma(a_lo, r, fma(a_hi, r, -1.0f)); // (a_hi + a_lo) * r - 1: residual error of r as a reciprocal of a
+
+	const float r_scaled_hi = as_float(tmp1 & ~4095U); // r_scaled with its low 12 mantissa bits cleared
+	const float h_hi = as_float(as_uint(h) & ~4095U); // h with its low 12 mantissa bits cleared
+
+	const float r_scaled_lo = r_scaled - r_scaled_hi; // what the masking above removed from r_scaled
+	const float h_lo = h - h_hi; // what the masking above removed from h
+
+	const float x1 = h_hi * r_scaled_hi; // leading part of h * r_scaled
+	const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo; // remaining terms; x1 + x2 equals h * r_scaled in exact arithmetic
+
+	const long h1 = convert_long_rte(x1); // x1 rounded to nearest
+	const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2)); // x2 rounded up minus h * (x1 + x2) rounded down; NOTE(review): looks like a second-order error term - confirm
+
+	const ulong result = tmp0 & 0xFFFFFF; // low 24 bits of r's bit pattern
+	return (result << (shift + 9)) - ((h1 + h2) >> 2); // shift into place, then subtract the scaled-down error correction
+}
+
+inline long fast_div_heavy(long _a, int _b) // Signed division of _a by _b using a reciprocal multiply plus a one-step correction instead of the `/` operator.
+{
+	const ulong a = abs(_a); // work on magnitudes; the sign is applied on return
+	const uint b = abs(_b);
+	ulong q = mul_hi(a, get_reciprocal_heavy(b)); // quotient estimate: high 64 bits of a * reciprocal
+
+	const long tmp = a - q * b; // remainder implied by the estimate
+	const int overshoot = (tmp < 0) ? 1 : 0; // negative remainder: estimate was too large
+	const int undershoot = (tmp >= b) ? 1 : 0; // remainder not below b: estimate was too small
+	q += undershoot - overshoot; // adjust q by at most one in either direction
+
+	return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q; // negate when the XOR has its sign bit set; NOTE(review): assumes .s1 is the upper 32 bits of _a - confirm for target devices
+}
+
+#endif
+)==="