Skip to content

Commit

Permalink
V0.9.2 - unique sources for random math
Browse files Browse the repository at this point in the history
Also prevent rotate by 0 in merge
  • Loading branch information
ifdefelse committed Dec 8, 2018
1 parent 60bba1c commit 824cd79
Show file tree
Hide file tree
Showing 4 changed files with 254 additions and 246 deletions.
15 changes: 10 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,9 @@ void merge(uint32_t &a, uint32_t b, uint32_t r)
{
case 0: a = (a * 33) + b; break;
case 1: a = (a ^ b) * 33; break;
case 2: a = ROTL32(a, ((r >> 16) % 32)) ^ b; break;
case 3: a = ROTR32(a, ((r >> 16) % 32)) ^ b; break;
// prevent rotate by 0 which is a NOP
case 2: a = ROTL32(a, ((r >> 16) % 31) + 1) ^ b; break;
case 3: a = ROTR32(a, ((r >> 16) % 31) + 1) ^ b; break;
}
}
```
Expand Down Expand Up @@ -354,8 +355,11 @@ void progPowLoop(
if (i < PROGPOW_CNT_MATH)
{
// Random Math
int src1 = kiss99(prog_rnd) % PROGPOW_REGS;
int src2 = kiss99(prog_rnd) % PROGPOW_REGS;
// Generate 2 unique sources
int src_rnd = kiss99(prog_rnd) % (PROGPOW_REGS * (PROGPOW_REGS-1));
int src1 = src_rnd % PROGPOW_REGS; // 0 <= src1 < PROGPOW_REGS
int src2 = src_rnd / PROGPOW_REGS; // 0 <= src2 < PROGPOW_REGS - 1
if (src2 >= src1) ++src2; // src2 is now any reg other than src1
int sel1 = kiss99(prog_rnd);
int dst = mix_seq_dst[(mix_seq_dst_cnt++)%PROGPOW_REGS];
int sel2 = kiss99(prog_rnd);
Expand Down Expand Up @@ -396,7 +400,8 @@ A full run showing intermediate values can be seen in [result.log](test/result.l

## Change History

- 0.9.1 (current) - Shuffle what part of the DAG entry each lane accesses, suggested by [mbevand](https://github.com/ifdefelse/ProgPOW/pull/13)
- 0.9.2 (current) - Unique sources for math() and prevent rotation by 0 in merge(). Suggested by [SChernykh](https://github.com/ifdefelse/ProgPOW/issues/19)
- [0.9.1](https://github.com/ifdefelse/ProgPOW/blob/60bba1c3fdad6a54539fc3e9f05727547de9c58c/README.md) - Shuffle what part of the DAG entry each lane accesses. Suggested by [mbevand](https://github.com/ifdefelse/ProgPOW/pull/13)
- [0.9.0](https://github.com/ifdefelse/ProgPOW/blob/a3f62349a1513f0393524683f9671cfe17cca895/README.md) - Unique cache address sources, re-tune parameters
- [0.8.0](https://github.com/ifdefelse/ProgPOW/blob/620b4c7aafe60391f863372814d7517e94386379/README.md) - Original spec

Expand Down
25 changes: 14 additions & 11 deletions libprogpow/ProgPow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,9 @@ std::string ProgPow::getKern(uint64_t block_number, kernel_t kern)
ret << "data_dag = g_dag[offset];\n";
ret << "// hack to prevent compiler from reordering LD and usage\n";
if (kern == KERNEL_CUDA)
ret << "if( hack_false ) __threadfence_block();\n";
ret << "if (hack_false) __threadfence_block();\n";
else
ret << "if( hack_false ) barrier(CLK_LOCAL_MEM_FENCE);\n";
ret << "if (hack_false) barrier(CLK_LOCAL_MEM_FENCE);\n";

for (int i = 0; (i < PROGPOW_CNT_CACHE) || (i < PROGPOW_CNT_MATH); i++)
{
Expand All @@ -159,25 +159,28 @@ std::string ProgPow::getKern(uint64_t block_number, kernel_t kern)
if (i < PROGPOW_CNT_MATH)
{
// Random Math
// A tree combining random input registers together
// reduced to a single result
std::string src1 = mix_src();
std::string src2 = mix_src();
// Generate 2 unique sources
int src_rnd = rnd() % ((PROGPOW_REGS - 1) * PROGPOW_REGS);
int src1 = src_rnd % PROGPOW_REGS; // 0 <= src1 < PROGPOW_REGS
int src2 = src_rnd / PROGPOW_REGS; // 0 <= src2 < PROGPOW_REGS - 1
if (src2 >= src1) ++src2; // src2 is now any reg other than src1
std::string src1_str = "mix[" + std::to_string(src1) + "]";
std::string src2_str = "mix[" + std::to_string(src2) + "]";
uint32_t r1 = rnd();
std::string dest = mix_dst();
uint32_t r2 = rnd();
ret << "// random math " << i << "\n";
ret << math("data", src1, src2, r1);
ret << math("data", src1_str, src2_str, r1);
ret << merge(dest, "data", r2);
}
}
// Consume the global load data at the very end of the loop, to allow fully latency hiding
ret << "// consume global load data\n";
ret << "// hack to prevent compiler from reordering LD and usage\n";
if (kern == KERNEL_CUDA)
ret << "if( hack_false ) __threadfence_block();\n";
ret << "if (hack_false) __threadfence_block();\n";
else
ret << "if( hack_false ) barrier(CLK_LOCAL_MEM_FENCE);\n";
ret << "if (hack_false) barrier(CLK_LOCAL_MEM_FENCE);\n";
ret << merge("mix[0]", "data_dag.s[0]", rnd());
for (int i = 1; i < PROGPOW_DAG_LOADS; i++)
{
Expand All @@ -200,8 +203,8 @@ std::string ProgPow::merge(std::string a, std::string b, uint32_t r)
{
case 0: return a + " = (" + a + " * 33) + " + b + ";\n";
case 1: return a + " = (" + a + " ^ " + b + ") * 33;\n";
case 2: return a + " = ROTL32(" + a + ", " + std::to_string((r >> 16) % 32) + ") ^ " + b + ";\n";
case 3: return a + " = ROTR32(" + a + ", " + std::to_string((r >> 16) % 32) + ") ^ " + b + ";\n";
case 2: return a + " = ROTL32(" + a + ", " + std::to_string(((r >> 16) % 31) + 1) + ") ^ " + b + ";\n";
case 3: return a + " = ROTR32(" + a + ", " + std::to_string(((r >> 16) % 31) + 1) + ") ^ " + b + ";\n";
}
return "#error\n";
}
Expand Down
260 changes: 130 additions & 130 deletions test/kernel.cu
Original file line number Diff line number Diff line change
@@ -1,133 +1,133 @@
// Inner loop for prog_seed 600
__device__ __forceinline__ void progPowLoop(const uint32_t loop,
uint32_t mix[PROGPOW_REGS],
const dag_t *g_dag,
const uint32_t c_dag[PROGPOW_CACHE_WORDS],
const bool hack_false)
uint32_t mix[PROGPOW_REGS],
const dag_t *g_dag,
const uint32_t c_dag[PROGPOW_CACHE_WORDS],
const bool hack_false)
{
dag_t data_dag;
uint32_t offset, data;
const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES-1);
// global load
offset = __shfl_sync(0xFFFFFFFF, mix[0], loop%PROGPOW_LANES, PROGPOW_LANES);
offset %= PROGPOW_DAG_ELEMENTS;
offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;
data_dag = g_dag[offset];
// hack to prevent compiler from reordering LD and usage
if( hack_false ) __threadfence_block();
// cache load 0
offset = mix[26] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[0] = (mix[0] ^ data) * 33;
// random math 0
data = clz(mix[10]) + clz(mix[5]);
mix[4] = ROTR32(mix[4], 1) ^ data;
// cache load 1
offset = mix[30] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[27] = (mix[27] * 33) + data;
// random math 1
data = min(mix[24], mix[16]);
mix[26] = (mix[26] ^ data) * 33;
// cache load 2
offset = mix[1] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[13] = ROTL32(mix[13], 4) ^ data;
// random math 2
data = popcount(mix[3]) + popcount(mix[25]);
mix[15] = ROTR32(mix[15], 14) ^ data;
// cache load 3
offset = mix[19] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[17] = (mix[17] ^ data) * 33;
// random math 3
data = mix[13] * mix[23];
mix[7] = (mix[7] * 33) + data;
// cache load 4
offset = mix[11] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[14] = (mix[14] ^ data) * 33;
// random math 4
data = mix[30] ^ mix[17];
mix[8] = ROTR32(mix[8], 18) ^ data;
// cache load 5
offset = mix[21] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[9] = (mix[9] * 33) + data;
// random math 5
data = mix[0] | mix[20];
mix[12] = (mix[12] * 33) + data;
// cache load 6
offset = mix[15] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[3] = (mix[3] ^ data) * 33;
// random math 6
data = mix[15] | mix[29];
mix[10] = ROTR32(mix[10], 14) ^ data;
// cache load 7
offset = mix[18] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[1] = (mix[1] ^ data) * 33;
// random math 7
data = ROTL32(mix[30], mix[15]);
mix[11] = (mix[11] * 33) + data;
// cache load 8
offset = mix[3] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[6] = (mix[6] * 33) + data;
// random math 8
data = mix[11] + mix[9];
mix[16] = ROTR32(mix[16], 4) ^ data;
// cache load 9
offset = mix[17] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[28] = ROTL32(mix[28], 21) ^ data;
// random math 9
data = mix[26] + mix[7];
mix[31] = (mix[31] ^ data) * 33;
// cache load 10
offset = mix[31] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[2] = ROTR32(mix[2], 21) ^ data;
// random math 10
data = min(mix[11], mix[1]);
mix[19] = ROTR32(mix[19], 13) ^ data;
// cache load 11
offset = mix[16] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[30] = ROTR32(mix[30], 7) ^ data;
// random math 11
data = mix[10] & mix[14];
mix[22] = (mix[22] * 33) + data;
// random math 12
data = mix[28] ^ mix[30];
mix[29] = ROTR32(mix[29], 28) ^ data;
// random math 13
data = min(mix[17], mix[17]);
mix[5] = (mix[5] ^ data) * 33;
// random math 14
data = mix[23] + mix[25];
mix[24] = ROTL32(mix[24], 6) ^ data;
// random math 15
data = mix[1] & mix[27];
mix[18] = (mix[18] ^ data) * 33;
// random math 16
data = ROTL32(mix[3], mix[16]);
mix[25] = (mix[25] * 33) + data;
// random math 17
data = ROTL32(mix[0], mix[9]);
mix[23] = (mix[23] * 33) + data;
// random math 18
data = mix[2] ^ mix[0];
mix[21] = (mix[21] * 33) + data;
// random math 19
data = mix[25] * mix[10];
mix[20] = (mix[20] ^ data) * 33;
// consume global load data
// hack to prevent compiler from reordering LD and usage
if( hack_false ) __threadfence_block();
mix[0] = (mix[0] ^ data_dag.s[0]) * 33;
mix[0] = (mix[0] * 33) + data_dag.s[1];
mix[4] = ROTR32(mix[4], 1) ^ data_dag.s[2];
mix[27] = ROTL32(mix[27], 31) ^ data_dag.s[3];
}
dag_t data_dag;
uint32_t offset, data;
const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1);
// global load
offset = __shfl_sync(0xFFFFFFFF, mix[0], loop%PROGPOW_LANES, PROGPOW_LANES);
offset %= PROGPOW_DAG_ELEMENTS;
offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;
data_dag = g_dag[offset];
// hack to prevent compiler from reordering LD and usage
if (hack_false) __threadfence_block();
// cache load 0
offset = mix[26] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[0] = (mix[0] ^ data) * 33;
// random math 0
data = mix[10] ^ mix[16];
mix[4] = ROTL32(mix[4], 27) ^ data;
// cache load 1
offset = mix[30] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[27] = ROTR32(mix[27], 7) ^ data;
// random math 1
data = mix[24] & mix[14];
mix[26] = (mix[26] * 33) + data;
// cache load 2
offset = mix[1] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[13] = (mix[13] * 33) + data;
// random math 2
data = mix[17] & mix[16];
mix[15] = ROTR32(mix[15], 12) ^ data;
// cache load 3
offset = mix[19] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[17] = (mix[17] ^ data) * 33;
// random math 3
data = mul_hi(mix[31], mix[5]);
mix[7] = (mix[7] ^ data) * 33;
// cache load 4
offset = mix[11] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[14] = (mix[14] ^ data) * 33;
// random math 4
data = mix[23] * mix[19];
mix[8] = (mix[8] * 33) + data;
// cache load 5
offset = mix[21] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[9] = (mix[9] ^ data) * 33;
// random math 5
data = clz(mix[30]) + clz(mix[15]);
mix[12] = ROTR32(mix[12], 16) ^ data;
// cache load 6
offset = mix[15] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[3] = ROTR32(mix[3], 27) ^ data;
// random math 6
data = clz(mix[12]) + clz(mix[5]);
mix[10] = (mix[10] * 33) + data;
// cache load 7
offset = mix[18] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[1] = ROTR32(mix[1], 6) ^ data;
// random math 7
data = min(mix[4], mix[25]);
mix[11] = ROTR32(mix[11], 27) ^ data;
// cache load 8
offset = mix[3] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[6] = (mix[6] ^ data) * 33;
// random math 8
data = mul_hi(mix[18], mix[16]);
mix[16] = (mix[16] ^ data) * 33;
// cache load 9
offset = mix[17] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[28] = ROTL32(mix[28], 17) ^ data;
// random math 9
data = ROTL32(mix[15], mix[23]);
mix[31] = (mix[31] * 33) + data;
// cache load 10
offset = mix[31] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[2] = (mix[2] * 33) + data;
// random math 10
data = mix[11] | mix[17];
mix[19] = ROTL32(mix[19], 28) ^ data;
// cache load 11
offset = mix[16] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[30] = ROTR32(mix[30], 18) ^ data;
// random math 11
data = mix[22] * mix[7];
mix[22] = ROTR32(mix[22], 30) ^ data;
// random math 12
data = mix[27] & mix[16];
mix[29] = ROTR32(mix[29], 25) ^ data;
// random math 13
data = ROTL32(mix[11], mix[0]);
mix[5] = (mix[5] ^ data) * 33;
// random math 14
data = ROTR32(mix[15], mix[25]);
mix[24] = ROTL32(mix[24], 13) ^ data;
// random math 15
data = mix[14] & mix[26];
mix[18] = (mix[18] * 33) + data;
// random math 16
data = mix[28] * mix[16];
mix[25] = (mix[25] ^ data) * 33;
// random math 17
data = mix[11] * mix[0];
mix[23] = (mix[23] ^ data) * 33;
// random math 18
data = mix[2] + mix[24];
mix[21] = ROTR32(mix[21], 20) ^ data;
// random math 19
data = mix[25] + mix[4];
mix[20] = ROTL32(mix[20], 22) ^ data;
// consume global load data
// hack to prevent compiler from reordering LD and usage
if (hack_false) __threadfence_block();
mix[0] = (mix[0] ^ data_dag.s[0]) * 33;
mix[0] = ROTR32(mix[0], 21) ^ data_dag.s[1];
mix[4] = (mix[4] * 33) + data_dag.s[2];
mix[27] = (mix[27] ^ data_dag.s[3]) * 33;
}
Loading

0 comments on commit 824cd79

Please sign in to comment.