Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cryptonight variant 2 support #160

Merged
merged 3 commits into from
Sep 15, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/amd/GpuContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct GpuContext
stridedIndex(1),
memChunk(2),
compMode(1),
unrollFactor(8),
DeviceID(nullptr),
CommandQueues(nullptr),
InputBuffer(nullptr),
Expand All @@ -58,13 +59,14 @@ struct GpuContext
{}


inline GpuContext(size_t index, size_t intensity, size_t worksize, int stridedIndex, int memChunk, bool compMode) :
inline GpuContext(size_t index, size_t intensity, size_t worksize, int stridedIndex, int memChunk, bool compMode, int unrollFactor) :
deviceIdx(index),
rawIntensity(intensity),
workSize(worksize),
stridedIndex(stridedIndex),
memChunk(memChunk),
compMode(compMode ? 1 : 0),
unrollFactor(unrollFactor),
DeviceID(nullptr),
CommandQueues(nullptr),
InputBuffer(nullptr),
Expand All @@ -84,6 +86,7 @@ struct GpuContext
int stridedIndex;
int memChunk;
int compMode;
int unrollFactor;

/*Output vars*/
cl_device_id DeviceID;
Expand All @@ -92,7 +95,7 @@ struct GpuContext
cl_mem OutputBuffer;
cl_mem ExtraBuffers[6];
cl_program Program;
cl_kernel Kernels[11];
cl_kernel Kernels[12];
size_t freeMem;
int computeUnits;
std::string name;
Expand Down
10 changes: 8 additions & 2 deletions src/amd/OclCLI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ bool OclCLI::setup(std::vector<xmrig::IThread *> &threads)
}

for (size_t i = 0; i < m_devices.size(); i++) {
threads.push_back(new OclThread(m_devices[i], intensity(i), worksize(i), affinity(i)));
threads.push_back(new OclThread(m_devices[i], intensity(i), worksize(i), affinity(i), unrollFactor(i)));
}

return true;
Expand Down Expand Up @@ -136,13 +136,19 @@ void OclCLI::parseLaunch(const char *arg)
else if (count == 2) {
m_worksize.push_back(v > 0 ? v : 8);
}
else if (count == 3) {
m_unrollFactor.push_back(v > 0 ? v : 8);
}

pch = strtok(nullptr, "x");
}

if (count == 1) {
if (count < 2) {
m_worksize.push_back(8);
}
if (count < 3) {
m_unrollFactor.push_back(8);
}
}

free(value);
Expand Down
2 changes: 2 additions & 0 deletions src/amd/OclCLI.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class OclCLI
inline int affinity(int index) const { return get(m_affinity, index, -1); }
inline int intensity(int index) const { return get(m_intensity, index, 0); }
inline int worksize(int index) const { return get(m_worksize, index, 8); }
inline int unrollFactor(int index) const { return get(m_unrollFactor, index, 8); }

int get(const std::vector<int> &vector, int index, int defaultValue) const;
void parse(std::vector<int> &vector, const char *arg) const;
Expand All @@ -65,6 +66,7 @@ class OclCLI
std::vector<int> m_devices;
std::vector<int> m_intensity;
std::vector<int> m_worksize;
std::vector<int> m_unrollFactor;
};


Expand Down
6 changes: 4 additions & 2 deletions src/amd/OclCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,20 @@ OclCache::OclCache(int index, cl_context opencl_ctx, GpuContext *ctx, const char
bool OclCache::load()
{
const xmrig::Algo algo = m_config->algorithm().algo();

const int64_t timeStart = xmrig::currentMSecsSinceEpoch();

char options[512] = { 0 };
snprintf(options, sizeof(options), "-DITERATIONS=%u -DMASK=%u -DWORKSIZE=%zu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%zu -DALGO=%d",
snprintf(options, sizeof(options), "-DITERATIONS=%u -DMASK=%u -DWORKSIZE=%zu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%zu -DALGO=%d -DUNROLL_FACTOR=%d",
xmrig::cn_select_iter(algo, xmrig::VARIANT_0),
xmrig::cn_select_mask(algo),
m_ctx->workSize,
m_ctx->stridedIndex,
static_cast<int>(1u << m_ctx->memChunk),
m_ctx->compMode,
xmrig::cn_select_memory(algo),
static_cast<int>(algo)
static_cast<int>(algo),
m_ctx->unrollFactor
);

if (!prepare(options)) {
Expand Down
25 changes: 16 additions & 9 deletions src/amd/OclGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ inline static int cnKernelOffset(uint32_t variant)
case xmrig::VARIANT_TUBE:
return 10;

case xmrig::VARIANT_2:
return 11;

default:
break;
}
Expand All @@ -128,8 +131,8 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch
getDeviceName(ctx->DeviceID, buf, sizeof(buf));
ctx->computeUnits = getDeviceMaxComputeUnits(ctx->DeviceID);

LOG_INFO(config->isColors() ? "\x1B[01;37m#%d\x1B[0m, GPU \x1B[01;37m#%zu\x1B[0m \x1B[01;32m%s\x1B[0m, intensity: \x1B[01;37m%zu\x1B[0m (%zu/%zu), cu: \x1B[01;37m%d" : "#%d, GPU #%zu (%s), intensity: %zu (%zu/%zu), cu: %d",
index, ctx->deviceIdx, buf, ctx->rawIntensity, ctx->workSize, MaximumWorkSize, ctx->computeUnits);
LOG_INFO(config->isColors() ? "\x1B[01;37m#%d\x1B[0m, GPU \x1B[01;37m#%zu\x1B[0m \x1B[01;32m%s\x1B[0m, intensity: \x1B[01;37m%zu\x1B[0m (%zu/%zu), unroll: \x1B[01;37m%d, cu: \x1B[01;37m%d" : "#%d, GPU #%zu (%s), intensity: %zu (%zu/%zu), unroll: %d, cu: %d",
index, ctx->deviceIdx, buf, ctx->rawIntensity, ctx->workSize, MaximumWorkSize, ctx->unrollFactor, ctx->computeUnits);

ctx->CommandQueues = OclLib::createCommandQueue(opencl_ctx, ctx->DeviceID, &ret);
if (ret != CL_SUCCESS) {
Expand Down Expand Up @@ -195,8 +198,8 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch
return OCL_ERR_API;
}

const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero", "cn1_msr", "cn1_xao", "cn1_tube"};
for (int i = 0; i < 11; ++i) {
const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero", "cn1_msr", "cn1_xao", "cn1_tube", "cn1_v2_monero"};
for (int i = 0; i < 12; ++i) {
ctx->Kernels[i] = OclLib::createKernel(ctx->Program, KernelNames[i], &ret);
if (ret != CL_SUCCESS) {
return OCL_ERR_API;
Expand Down Expand Up @@ -401,13 +404,17 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, xmrig::Config *config)
const char *wolfSkeinCL =
#include "./opencl/wolf-skein.cl"
;
const char *fastIntMathV2CL =
#include "./opencl/fast_int_math_v2.cl"
;

std::string source_code(cryptonightCL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_AES"), wolfAesCL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_JH"), jhCL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_BLAKE256"), blake256CL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_GROESTL256"), groestl256CL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_AES"), wolfAesCL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_JH"), jhCL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_BLAKE256"), blake256CL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_GROESTL256"), groestl256CL);
source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);

for (size_t i = 0; i < num_gpus; ++i) {
if (ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) {
Expand Down
130 changes: 127 additions & 3 deletions src/amd/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ XMRIG_INCLUDE_JH
XMRIG_INCLUDE_BLAKE256
//#include "opencl/groestl256.cl"
XMRIG_INCLUDE_GROESTL256
//#include "fast_int_math_v2.cl"
XMRIG_INCLUDE_FAST_INT_MATH_V2


#define VARIANT_0 0 // Original CryptoNight or CryptoNight-Heavy
Expand Down Expand Up @@ -580,7 +582,7 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
if (gIdx < Threads)
# endif
{
#pragma unroll 8
#pragma unroll UNROLL_FACTOR
for (int i = 0; i < ITERATIONS; ++i) {
ulong c[2];

Expand Down Expand Up @@ -615,6 +617,128 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
}


)==="
R"===(

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_v2_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, uint variant, __global ulong *input)
{
ulong a[2], b[4];
__local uint AES0[256], AES1[256], AES2[256], AES3[256], RCP[256];

const ulong gIdx = getIdx();

for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
RCP[i] = RCP_C[i];
}

barrier(CLK_LOCAL_MEM_FENCE);

# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
Scratchpad += gIdx;
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif

a[0] = states[0] ^ states[4];
a[1] = states[1] ^ states[5];

b[0] = states[2] ^ states[6];
b[1] = states[3] ^ states[7];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
}

ulong2 bx0 = ((ulong2 *)b)[0];
ulong2 bx1 = ((ulong2 *)b)[1];

mem_fence(CLK_LOCAL_MEM_FENCE);

#if (STRIDED_INDEX == 0)
#define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
#elif (STRIDED_INDEX == 1)
#define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4)) * as_uint2(Threads).s0))
#elif (STRIDED_INDEX == 2)
#define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
#endif

# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
uint2 division_result = as_uint2(states[12]);
uint sqrt_result = as_uint2(states[13]).s0;

#pragma unroll UNROLL_FACTOR
for(int i = 0; i < ITERATIONS; ++i)
{
uint idx = a[0] & MASK;
uint4 c = SCRATCHPAD_CHUNK(0);
c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);

{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));

SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}

SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;

idx = as_ulong2(c).s0 & MASK;
uint4 tmp = SCRATCHPAD_CHUNK(0);

{
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;

division_result = fast_div_v2((__local uchar *) RCP, as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
}

{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));

SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}

a[1] += as_ulong2(c).s0 * as_ulong2(tmp).s0;
a[0] += mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);

SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];

((uint4 *)a)[0] ^= tmp;
bx1 = bx0;
bx0 = as_ulong2(c);
}

#undef SCRATCHPAD_CHUNK
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}

)==="
R"===(

Expand Down Expand Up @@ -757,7 +881,7 @@ __kernel void cn1_tube(__global uint4 *Scratchpad, __global ulong *states, ulong
{
ulong idx0 = a[0];

#pragma unroll 8
#pragma unroll UNROLL_FACTOR
for (int i = 0; i < ITERATIONS; ++i) {
ulong c[2];

Expand Down Expand Up @@ -852,7 +976,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
{
ulong idx0 = a[0];

#pragma unroll 8
#pragma unroll UNROLL_FACTOR
for (int i = 0; i < ITERATIONS; ++i) {
ulong c[2];

Expand Down
Loading