xmrig · xmrig · Sep 15, 2018 · Sep 13, 2018 · Sep 13, 2018 · Sep 14, 2018
diff --git a/src/amd/GpuContext.h b/src/amd/GpuContext.h
@@ -45,6 +45,7 @@ struct GpuContext
         stridedIndex(1),
         memChunk(2),
         compMode(1),
+        unrollFactor(8),
         DeviceID(nullptr),
         CommandQueues(nullptr),
         InputBuffer(nullptr),
@@ -58,13 +59,14 @@ struct GpuContext
     {}
 
 
-    inline GpuContext(size_t index, size_t intensity, size_t worksize, int stridedIndex, int memChunk, bool compMode) :
+    inline GpuContext(size_t index, size_t intensity, size_t worksize, int stridedIndex, int memChunk, bool compMode, int unrollFactor) :
         deviceIdx(index),
         rawIntensity(intensity),
         workSize(worksize),
         stridedIndex(stridedIndex),
         memChunk(memChunk),
         compMode(compMode ? 1 : 0),
+        unrollFactor(unrollFactor),
         DeviceID(nullptr),
         CommandQueues(nullptr),
         InputBuffer(nullptr),
@@ -84,6 +86,7 @@ struct GpuContext
     int stridedIndex;
     int memChunk;
     int compMode;
+    int unrollFactor;
 
     /*Output vars*/
     cl_device_id DeviceID;
@@ -92,7 +95,7 @@ struct GpuContext
     cl_mem OutputBuffer;
     cl_mem ExtraBuffers[6];
     cl_program Program;
-    cl_kernel Kernels[11];
+    cl_kernel Kernels[12];
     size_t freeMem;
     int computeUnits;
     std::string name;

diff --git a/src/amd/OclCLI.cpp b/src/amd/OclCLI.cpp
@@ -49,7 +49,7 @@ bool OclCLI::setup(std::vector<xmrig::IThread *> &threads)
     }
 
     for (size_t i = 0; i < m_devices.size(); i++) {
-        threads.push_back(new OclThread(m_devices[i], intensity(i), worksize(i), affinity(i)));
+        threads.push_back(new OclThread(m_devices[i], intensity(i), worksize(i), affinity(i), unrollFactor(i)));
     }
 
     return true;
@@ -136,13 +136,19 @@ void OclCLI::parseLaunch(const char *arg)
             else if (count == 2) {
                 m_worksize.push_back(v > 0 ? v : 8);
             }
+            else if (count == 3) {
+                m_unrollFactor.push_back(v > 0 ? v : 8);
+            }
 
             pch = strtok(nullptr, "x");
         }
 
-        if (count == 1) {
+        if (count < 2) {
             m_worksize.push_back(8);
         }
+        if (count < 3) {
+            m_unrollFactor.push_back(8);
+        }
     }
 
     free(value);

diff --git a/src/amd/OclCLI.h b/src/amd/OclCLI.h
@@ -57,6 +57,7 @@ class OclCLI
     inline int affinity(int index) const  { return get(m_affinity, index, -1); }
     inline int intensity(int index) const { return get(m_intensity, index, 0); }
     inline int worksize(int index) const  { return get(m_worksize, index, 8); }
+    inline int unrollFactor(int index) const { return get(m_unrollFactor, index, 8); }
 
     int get(const std::vector<int> &vector, int index, int defaultValue) const;
     void parse(std::vector<int> &vector, const char *arg) const;
@@ -65,6 +66,7 @@ class OclCLI
     std::vector<int> m_devices;
     std::vector<int> m_intensity;
     std::vector<int> m_worksize;
+    std::vector<int> m_unrollFactor;
 };
 
 

diff --git a/src/amd/OclCache.cpp b/src/amd/OclCache.cpp
@@ -52,18 +52,20 @@ OclCache::OclCache(int index, cl_context opencl_ctx, GpuContext *ctx, const char
 bool OclCache::load()
 {
     const xmrig::Algo algo  = m_config->algorithm().algo();
+
     const int64_t timeStart = xmrig::currentMSecsSinceEpoch();
 
     char options[512] = { 0 };
-    snprintf(options, sizeof(options), "-DITERATIONS=%u -DMASK=%u -DWORKSIZE=%zu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%zu -DALGO=%d",
+    snprintf(options, sizeof(options), "-DITERATIONS=%u -DMASK=%u -DWORKSIZE=%zu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%zu -DALGO=%d -DUNROLL_FACTOR=%d",
              xmrig::cn_select_iter(algo, xmrig::VARIANT_0),
              xmrig::cn_select_mask(algo),
              m_ctx->workSize,
              m_ctx->stridedIndex,
              static_cast<int>(1u << m_ctx->memChunk),
              m_ctx->compMode,
              xmrig::cn_select_memory(algo),
-             static_cast<int>(algo)
+             static_cast<int>(algo),
+             m_ctx->unrollFactor
              );
 
     if (!prepare(options)) {

diff --git a/src/amd/OclGPU.cpp b/src/amd/OclGPU.cpp
@@ -105,6 +105,9 @@ inline static int cnKernelOffset(uint32_t variant)
     case xmrig::VARIANT_TUBE:
         return 10;
 
+    case xmrig::VARIANT_2:
+        return 11;
+
     default:
         break;
     }
@@ -128,8 +131,8 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch
     getDeviceName(ctx->DeviceID, buf, sizeof(buf));
     ctx->computeUnits = getDeviceMaxComputeUnits(ctx->DeviceID);
 
-    LOG_INFO(config->isColors() ? "\x1B[01;37m#%d\x1B[0m, GPU \x1B[01;37m#%zu\x1B[0m \x1B[01;32m%s\x1B[0m, intensity: \x1B[01;37m%zu\x1B[0m (%zu/%zu), cu: \x1B[01;37m%d"  : "#%d, GPU #%zu (%s), intensity: %zu (%zu/%zu), cu: %d",
-        index, ctx->deviceIdx, buf, ctx->rawIntensity, ctx->workSize, MaximumWorkSize, ctx->computeUnits);
+    LOG_INFO(config->isColors() ? "\x1B[01;37m#%d\x1B[0m, GPU \x1B[01;37m#%zu\x1B[0m \x1B[01;32m%s\x1B[0m, intensity: \x1B[01;37m%zu\x1B[0m (%zu/%zu), unroll: \x1B[01;37m%d, cu: \x1B[01;37m%d"  : "#%d, GPU #%zu (%s), intensity: %zu (%zu/%zu), unroll: %d, cu: %d",
+        index, ctx->deviceIdx, buf, ctx->rawIntensity, ctx->workSize, MaximumWorkSize, ctx->unrollFactor, ctx->computeUnits);
 
     ctx->CommandQueues = OclLib::createCommandQueue(opencl_ctx, ctx->DeviceID, &ret);
     if (ret != CL_SUCCESS) {
@@ -195,8 +198,8 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch
         return OCL_ERR_API;
     }
 
-    const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero", "cn1_msr", "cn1_xao", "cn1_tube"};
-    for (int i = 0; i < 11; ++i) {
+    const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero", "cn1_msr", "cn1_xao", "cn1_tube", "cn1_v2_monero"};
+    for (int i = 0; i < 12; ++i) {
         ctx->Kernels[i] = OclLib::createKernel(ctx->Program, KernelNames[i], &ret);
         if (ret != CL_SUCCESS) {
             return OCL_ERR_API;
@@ -401,13 +404,17 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, xmrig::Config *config)
     const char *wolfSkeinCL =
             #include "./opencl/wolf-skein.cl"
     ;
+    const char *fastIntMathV2CL =
+        #include "./opencl/fast_int_math_v2.cl"
+    ;
 
     std::string source_code(cryptonightCL);
-    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_AES"),   wolfAesCL);
-    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
-    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_JH"),         jhCL);
-    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_BLAKE256"),   blake256CL);
-    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_GROESTL256"), groestl256CL);
+    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_AES"),         wolfAesCL);
+    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_SKEIN"),       wolfSkeinCL);
+    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_JH"),               jhCL);
+    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_BLAKE256"),         blake256CL);
+    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_GROESTL256"),       groestl256CL);
+    source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
 
     for (size_t i = 0; i < num_gpus; ++i) {
         if (ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) {

diff --git a/src/amd/opencl/cryptonight.cl b/src/amd/opencl/cryptonight.cl
@@ -88,6 +88,8 @@ XMRIG_INCLUDE_JH
 XMRIG_INCLUDE_BLAKE256
 //#include "opencl/groestl256.cl"
 XMRIG_INCLUDE_GROESTL256
+//#include "fast_int_math_v2.cl"
+XMRIG_INCLUDE_FAST_INT_MATH_V2
 
 
 #define VARIANT_0    0  // Original CryptoNight or CryptoNight-Heavy
@@ -580,7 +582,7 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
     if (gIdx < Threads)
 #   endif
     {
-        #pragma unroll 8
+        #pragma unroll UNROLL_FACTOR
         for (int i = 0; i < ITERATIONS; ++i) {
             ulong c[2];
 
@@ -615,6 +617,128 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
 }
 
 
+)==="
+R"===(
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void cn1_v2_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, uint variant, __global ulong *input)
+{
+	ulong a[2], b[4];
+	__local uint AES0[256], AES1[256], AES2[256], AES3[256], RCP[256];
+
+	const ulong gIdx = getIdx();
+
+	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
+	{
+		const uint tmp = AES0_C[i];
+		AES0[i] = tmp;
+		AES1[i] = rotate(tmp, 8U);
+		AES2[i] = rotate(tmp, 16U);
+		AES3[i] = rotate(tmp, 24U);
+		RCP[i] = RCP_C[i];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+#	if (COMP_MODE == 1)
+	// do not use early return here
+	if (gIdx < Threads)
+#	endif
+	{
+		states += 25 * gIdx;
+#		if (STRIDED_INDEX == 0)
+		Scratchpad += gIdx * (MEMORY >> 4);
+#		elif (STRIDED_INDEX == 1)
+		Scratchpad += gIdx;
+#		elif (STRIDED_INDEX == 2)
+		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+#		endif
+
+		a[0] = states[0] ^ states[4];
+		a[1] = states[1] ^ states[5];
+
+		b[0] = states[2] ^ states[6];
+		b[1] = states[3] ^ states[7];
+		b[2] = states[8] ^ states[10];
+		b[3] = states[9] ^ states[11];
+	}
+
+	ulong2 bx0 = ((ulong2 *)b)[0];
+	ulong2 bx1 = ((ulong2 *)b)[1];
+
+	mem_fence(CLK_LOCAL_MEM_FENCE);
+
+#if (STRIDED_INDEX == 0)
+#define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
+#elif (STRIDED_INDEX == 1)
+#define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4)) * as_uint2(Threads).s0))
+#elif (STRIDED_INDEX == 2)
+#define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
+#endif
+
+#   if (COMP_MODE == 1)
+    // do not use early return here
+    if (gIdx < Threads)
+#   endif
+    {
+	uint2 division_result = as_uint2(states[12]);
+	uint sqrt_result = as_uint2(states[13]).s0;
+
+	#pragma unroll UNROLL_FACTOR
+	for(int i = 0; i < ITERATIONS; ++i)
+	{
+		uint idx = a[0] & MASK;
+		uint4 c = SCRATCHPAD_CHUNK(0);
+		c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);
+
+		{
+			const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+			const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+			const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+
+			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
+			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
+			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+		}
+
+		SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;
+
+		idx = as_ulong2(c).s0 & MASK;
+		uint4 tmp = SCRATCHPAD_CHUNK(0);
+
+		{
+			tmp.s0 ^= division_result.s0;
+			tmp.s1 ^= division_result.s1 ^ sqrt_result;
+
+			division_result = fast_div_v2((__local uchar *) RCP, as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
+			sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
+		}
+
+		{
+			const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+			const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+			const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+
+			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
+			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
+			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+		}
+
+		a[1] += as_ulong2(c).s0 * as_ulong2(tmp).s0;
+		a[0] += mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);
+
+		SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
+
+		((uint4 *)a)[0] ^= tmp;
+		bx1 = bx0;
+		bx0 = as_ulong2(c);
+	}
+
+#undef SCRATCHPAD_CHUNK
+    }
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
 )==="
 R"===(
 
@@ -757,7 +881,7 @@ __kernel void cn1_tube(__global uint4 *Scratchpad, __global ulong *states, ulong
     {
         ulong idx0 = a[0];
 
-        #pragma unroll 8
+        #pragma unroll UNROLL_FACTOR
         for (int i = 0; i < ITERATIONS; ++i) {
             ulong c[2];
 
@@ -852,7 +976,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
     {
         ulong idx0 = a[0];
 
-        #pragma unroll 8
+        #pragma unroll UNROLL_FACTOR
         for (int i = 0; i < ITERATIONS; ++i) {
             ulong c[2];