From 706225bf40b7de97549fc6be8cda69278ff230de Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Thu, 5 Mar 2020 19:15:39 +0100 Subject: [PATCH 01/19] Ignore CMakeSettings.json from Visual Studio --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0d69d66d2..07f699fbd 100644 --- a/.gitignore +++ b/.gitignore @@ -75,3 +75,4 @@ node_modules/ # vscode .vscode/ /.vs +CMakeSettings.json From bee4dc18ef8f841ce2104e18afe0c6d3070727ea Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Thu, 5 Mar 2020 19:16:22 +0100 Subject: [PATCH 02/19] Some white spaces removal --- cmake/Hunter/config.cmake | 3 ++- libethash-cuda/CUDAMiner.cpp | 18 +++++++++--------- libethash-cuda/CUDAMiner_kernel.cu | 2 +- libprogpow/ProgPow.cpp | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cmake/Hunter/config.cmake b/cmake/Hunter/config.cmake index 418c3cc35..74167d94c 100644 --- a/cmake/Hunter/config.cmake +++ b/cmake/Hunter/config.cmake @@ -1,2 +1,3 @@ hunter_config(CURL VERSION ${HUNTER_CURL_VERSION} CMAKE_ARGS HTTP_ONLY=ON CMAKE_USE_OPENSSL=OFF CMAKE_USE_LIBSSH2=OFF) -hunter_config(libjson-rpc-cpp VERSION ${HUNTER_libjson-rpc-cpp_VERSION} CMAKE_ARGS TCP_SOCKET_SERVER=ON) \ No newline at end of file +hunter_config(libjson-rpc-cpp VERSION ${HUNTER_libjson-rpc-cpp_VERSION} CMAKE_ARGS TCP_SOCKET_SERVER=ON) +hunter_config(Boost VERSION 1.70.0-p0) \ No newline at end of file diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp index 59e837bfb..e682ab5b8 100644 --- a/libethash-cuda/CUDAMiner.cpp +++ b/libethash-cuda/CUDAMiner.cpp @@ -74,7 +74,7 @@ bool CUDAMiner::init(int epoch) cuda_init(getNumDevices(), light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), s_dagInHostMemory, s_dagCreateDevice); s_dagLoadIndex++; - + if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE) { if (s_dagLoadIndex >= s_numInstances && s_dagInHostMemory) @@ -369,7 +369,7 @@ bool CUDAMiner::cuda_init( cudalog << "CUDA device " << string(device_props.name) << " has insufficient GPU memory." << device_props.totalGlobalMem << " bytes of memory found < " << dagBytes << " bytes of memory required"; return false; } - //We need to reset the device and recreate the dag + //We need to reset the device and recreate the dag cudalog << "Resetting device"; CUDA_SAFE_CALL(cudaDeviceReset()); CUdevice device; @@ -379,23 +379,23 @@ bool CUDAMiner::cuda_init( //We need to reset the light and the Dag for the following code to reallocate //since cudaDeviceReset() frees all previous allocated memory m_light[m_device_num] = nullptr; - m_dag = nullptr; + m_dag = nullptr; } // create buffer for cache hash64_t * dag = m_dag; hash64_t * light = m_light[m_device_num]; - if(!light){ + if(!light){ cudalog << "Allocating light with size: " << _lightBytes; CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast(&light), _lightBytes)); } // copy lightData to device CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast(light), _lightData, _lightBytes, cudaMemcpyHostToDevice)); m_light[m_device_num] = light; - + if(dagElms != m_dag_elms || !dag) // create buffer for dag CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast(&dag), dagBytes)); - + if(dagElms != m_dag_elms || !dag) { // create mining buffers @@ -405,7 +405,7 @@ bool CUDAMiner::cuda_init( CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], sizeof(search_results))); CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i])); } - + memset(&m_current_header, 0, sizeof(hash32_t)); m_current_target = 0; m_current_nonce = 0; @@ -429,7 +429,7 @@ bool CUDAMiner::cuda_init( } }else{ while(!hostDAG) - this_thread::sleep_for(chrono::milliseconds(100)); + this_thread::sleep_for(chrono::milliseconds(100)); goto cpyDag; } } @@ -441,7 +441,7 @@ bool CUDAMiner::cuda_init( CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast(dag), hdag, dagBytes, cudaMemcpyHostToDevice)); } } - + m_dag = dag; m_dag_elms = dagElms; diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index 3f7666d29..e585faf50 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -139,7 +139,7 @@ __device__ __forceinline__ void fill_mix(uint64_t seed, uint32_t lane_id, uint32 mix[i] = kiss99(st); } -__global__ void +__global__ void progpow_search( uint64_t start_nonce, const hash32_t header, diff --git a/libprogpow/ProgPow.cpp b/libprogpow/ProgPow.cpp index 1907c5a30..afea9c4db 100644 --- a/libprogpow/ProgPow.cpp +++ b/libprogpow/ProgPow.cpp @@ -159,7 +159,7 @@ std::string ProgPow::getKern(uint64_t block_number, kernel_t kern) if (i < PROGPOW_CNT_MATH) { // Random Math - // Generate 2 unique sources + // Generate 2 unique sources int src_rnd = rnd() % ((PROGPOW_REGS - 1) * PROGPOW_REGS); int src1 = src_rnd % PROGPOW_REGS; // 0 <= src1 < PROGPOW_REGS int src2 = src_rnd / PROGPOW_REGS; // 0 <= src2 < PROGPOW_REGS - 1 From d45cd96f01faab544f2aeb09ff6d443b1e5c8d31 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Thu, 5 Mar 2020 19:16:44 +0100 Subject: [PATCH 03/19] Upgrade Hunter release --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a6dc05f2..2e6b37291 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,9 +13,9 @@ cable_configure_toolchain(DEFAULT cxx11) set(HUNTER_CONFIGURATION_TYPES Release) HunterGate( - URL "https://github.com/ruslo/hunter/archive/v0.20.34.tar.gz" - SHA1 "2f04d1beffdf39db1c40d8347beb8c10bbe9b8ed" - LOCAL + URL "https://github.com/ruslo/hunter/archive/v0.23.197.tar.gz" + SHA1 "f494a08bc9bb489527be1240d223d3ff69ece322" + LOCAL ) project(ethminer) From cbd1c160031ce8d2d3e75493ff8f341fa42bbf14 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Thu, 5 Mar 2020 19:17:18 +0100 Subject: [PATCH 04/19] Apply knobs for spec 0.9.3 --- libprogpow/ProgPow.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libprogpow/ProgPow.h b/libprogpow/ProgPow.h index ab6fee3bb..c20502254 100644 --- a/libprogpow/ProgPow.h +++ b/libprogpow/ProgPow.h @@ -4,7 +4,8 @@ #include // blocks before changing the random program -#define PROGPOW_PERIOD 50 +//#define PROGPOW_PERIOD 50 +#define PROGPOW_PERIOD 10 // lanes that work together calculating a hash #define PROGPOW_LANES 16 // uint32 registers per lane @@ -16,9 +17,11 @@ // DAG accesses, also the number of loops executed #define PROGPOW_CNT_DAG 64 // random cache accesses per loop -#define PROGPOW_CNT_CACHE 12 +//#define PROGPOW_CNT_CACHE 12 +#define PROGPOW_CNT_CACHE 11 // random math instructions per loop -#define PROGPOW_CNT_MATH 20 +//#define PROGPOW_CNT_MATH 20 +#define PROGPOW_CNT_MATH 18 class ProgPow { From 24225d2b6aab2f23f563d14dd0b2141204793dae Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Fri, 6 Mar 2020 10:55:49 +0100 Subject: [PATCH 05/19] Amend Readme with current implmentation of fill_mix --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7a1f6eccd..d35967f8f 100644 --- a/README.md +++ b/README.md @@ -196,13 +196,14 @@ void fill_mix( { // Use FNV to expand the per-warp seed to per-lane // Use KISS to expand the per-lane seed to fill mix + uint32_t fnv_hash = FNV_OFFSET_BASIS; kiss99_t st; - st.z = fnv1a(FNV_OFFSET_BASIS, seed); - st.w = fnv1a(st.z, seed >> 32); - st.jsr = fnv1a(st.w, lane_id); - st.jcong = fnv1a(st.jsr, lane_id); + st.z = fnv1a(fnv_hash, seed); + st.w = fnv1a(fnv_hash, seed >> 32); + st.jsr = fnv1a(fnv_hash, lane_id); + st.jcong = fnv1a(fnv_hash, lane_id); for (int i = 0; i < PROGPOW_REGS; i++) - mix[i] = kiss99(st); + mix[i] = kiss99(st); } ``` From e18d5d4d2a9bfdcf47f9497d74b19696850d7fce Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Fri, 6 Mar 2020 12:31:20 +0100 Subject: [PATCH 06/19] [CUDA] - Remove Issue-64 Rationale The problem highlighted by Kik revealed a vulnerable condition which allows the mining activity to be carried out bypassing, almost completely, the characteristics of memory-hardness embedded into the algo. Must be evidenced that this issue could be exploited only in bundle with a custom node implementation and the current "public" mining infrastructure does not allow the threat vector to go through. Nevertheless the concern is present and the code provided by Kik has proven real. Kik's discovery relies on two basic assumptions : - each block's header_hash is modifiable along with the providing of an according value for field "extraData" - due to above mining can be carried out bypassing, almost completely, DAG memory accesses and "transforming" in a "goal seeking" only on header_hash using high Keccak computational power. As additional consideration a "custom node" must be modifed to accept a mining result from work consumer with a modified header_hash (see below) The two above assumptions are valid for **both ethash and progpow** algorithms even if the latter, due to different width of "seed" is way more affected. Both ethash and progpow share the same "macro" steps with a difference on step 3: - input data (header_hash + nonce) go through a Keccak round which produces a "seed" (256 bits for ethash, 64 bits for ProgPoW) - "seed" drives the access to DAG memory locations which eventually produce a "mix_hash" (256 bits for both algos) - In **ethash** last round of Keccak uses as input : 16 words as carry-over from previous keccak round + 8 words mix_hash; in **progpow** the input is 8 words from header_hash + "seed" + 8 words from mix_hash The logic concatenation of operations allows **in ProgPoW** (Kik's code) to: 1 skip the first step and directly set an arbitrary value for "seed" 2 obtain mix_hash (doing only one round of memory accesses) 3 linearly iterate through "extraData" increments and build a `new_header_hash` (Keccak256) starting from the header_hash sent by work provider 4 Apply new_header_hash + seed + mix as input to last keccak round and compare first two words of digest result to target. If matching go ahead otherwise goto 3 5 We now have a new_header_hash and an "extraData" solving the algo. One thing left to do is to find a **nonce** which, combined with new_header in the initial Keccak round, produces a seed (and by consequence a mix_hash) which is equal to the one arbitrary set 6 To do so simply scan nonce range (2**64) and do as many Keccak rounds as possible to find the right one. Should range be exhausted goto 4 7 Eventually the miner will notify the "custom node" that the hashing has found a solution (nonce) changing header_hash due to extraData. The difference in width for the seed and the different absorb phase for last keccak round makes the issue orders of magnitude more relevant in ProgPoW than in ethash but nevertheless the same pattern apply. It goes without saying this method is highly impractical and economically unviable on ethash as of tiday ... but is there. Basically in both algos we have : - seed == F(header_hash, nonce) - mix == F(seed) // where memory access occurr - final == F(seed, mix) This means also that setting an arbitrary seed we get - mix == F(seed) // where memory access occurr - final == F(seed, mix) // where I find a "new_header" - nonce == F(new_header, seed) // which can be iterated very quickly Thus making possible to brute force keccak while bypassing memory accesses Purpose of this PR is to extend the dependency from header_hash to mix too so having - seed == F(header_hash, nonce) - mix == F(header_hash) // where memory access occurr - final == F(seed, mix) Thus having **both** mix and seed being dependant on header_hash thus making impossible to goal seek a seed without having to also change the mix. --- libethash-cuda/CUDAMiner_kernel.cu | 80 ++++++++++++++++++------------ 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index e585faf50..3a7d106bb 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -79,28 +79,18 @@ __device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x) // Keccak - implemented as a variant of SHAKE // The width is 800, with a bitrate of 576, a capacity of 224, and no padding -// Only need 64 bits of output for mining -__device__ __noinline__ uint64_t keccak_f800(hash32_t header, uint64_t seed, hash32_t digest) +__device__ __noinline__ void keccak_f800(uint32_t* st) { - uint32_t st[25]; - for (int i = 0; i < 25; i++) - st[i] = 0; - for (int i = 0; i < 8; i++) - st[i] = header.uint32s[i]; - st[8] = seed; - st[9] = seed >> 32; - for (int i = 0; i < 8; i++) - st[10+i] = digest.uint32s[i]; + // Assumes input state has already been filled + // at higher level - for (int r = 0; r < 21; r++) { + // Complete all 22 rounds as a separate impl to + // evaluate only first 8 words is wasteful of regsters + for (int r = 0; r < 22; r++) { keccak_f800_round(st, r); } - // last round can be simplified due to partial output - keccak_f800_round(st, 21); - // Byte swap so byte 0 of hash is MSB of result - return (uint64_t)cuda_swab32(st[0]) << 32 | cuda_swab32(st[1]); } #define fnv1a(h, d) (h = (uint32_t(h) ^ uint32_t(d)) * uint32_t(0x1000193)) @@ -124,14 +114,14 @@ __device__ __forceinline__ uint32_t kiss99(kiss99_t &st) return ((MWC^st.jcong) + st.jsr); } -__device__ __forceinline__ void fill_mix(uint64_t seed, uint32_t lane_id, uint32_t mix[PROGPOW_REGS]) +__device__ __forceinline__ void fill_mix(const hash32_t* header, uint32_t lane_id, uint32_t mix[PROGPOW_REGS]) { // Use FNV to expand the per-warp seed to per-lane // Use KISS to expand the per-lane seed to fill mix uint32_t fnv_hash = 0x811c9dc5; kiss99_t st; - st.z = fnv1a(fnv_hash, seed); - st.w = fnv1a(fnv_hash, seed >> 32); + st.z = fnv1a(fnv_hash, header.uint32s[0]); + st.w = fnv1a(fnv_hash, header.uint32s[1]); st.jsr = fnv1a(fnv_hash, lane_id); st.jcong = fnv1a(fnv_hash, lane_id); #pragma unroll @@ -155,7 +145,7 @@ progpow_search( const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1); - // Load the first portion of the DAG into the cache + // Load the first portion of the DAG into the shared cache for (uint32_t word = threadIdx.x*PROGPOW_DAG_LOADS; word < PROGPOW_CACHE_WORDS; word += blockDim.x*PROGPOW_DAG_LOADS) { dag_t load = g_dag[word/PROGPOW_DAG_LOADS]; @@ -163,23 +153,35 @@ progpow_search( c_dag[word + i] = load.s[i]; } - hash32_t digest; - for (int i = 0; i < 8; i++) - digest.uint32s[i] = 0; - // keccak(header..nonce) - uint64_t seed = keccak_f800(header, nonce, digest); - + // Force threads to sync and ensure shared mem is in sync __syncthreads(); + uint32_t state[25]; // Keccak's state + hash32_t digest; // Carry-over from keccak's output + + // Absorb phase for initial round of keccak + // 1st fill with header data (8 words) + for (int i = 0; i < 8; i++) + state[i] = header.uint32s[i]; + // 2nd fill with nonce (2 words) + state[8] = nonce; + state[9] = nonce >> 32; + // 3rd all remaining elements to zero + for (int i = 10; i < 25; i++) + state[i] = 0; + + // Run intial keccak round + keccak_f800(&state); + + // Main loop #pragma unroll 1 for (uint32_t h = 0; h < PROGPOW_LANES; h++) { uint32_t mix[PROGPOW_REGS]; - // share the hash's seed across all lanes - uint64_t hash_seed = __shfl_sync(0xFFFFFFFF, seed, h, PROGPOW_LANES); - // initialize mix for all lanes - fill_mix(hash_seed, lane_id, mix); + // initialize mix for all lanes using first + // two words from header_hash + fill_mix(header, lane_id, mix); #pragma unroll 1 for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++) @@ -207,8 +209,24 @@ progpow_search( digest = digest_temp; } + + // Absorb phase for last round of keccak (256 bits) + // 1st initial 8 words of state are kept as carry-over from initial keccak + // 2nd subsequent 8 words are carried from digest/mix + for (int i = 8; i < 16; i++) + state[i] = digest.uint32s[i]; + // 3rd all other elements to zero + for (int i = 16; i < 25; i++) + state[i] = 0; + + // Run keccak loop + keccak_f800(&state); + + // Extract result, swap endianness, and compare with target + uint64_t result = (uint64_t)cuda_swab32(state[0]) << 32 | cuda_swab32(state[1]); + // keccak(header .. keccak(header..nonce) .. digest); - if (keccak_f800(header, seed, digest) >= target) + if (result >= target) return; uint32_t index = atomicInc((uint32_t *)&g_output->count, 0xffffffff); From d5d206809d82175e6cdf88116f6903f6f2ca5356 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Fri, 6 Mar 2020 14:56:08 +0100 Subject: [PATCH 07/19] Missing inclusion of digest into fill_mix plus amend compile errors --- libethash-cuda/CUDAMiner_kernel.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index 3a7d106bb..5445b5bec 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -114,7 +114,7 @@ __device__ __forceinline__ uint32_t kiss99(kiss99_t &st) return ((MWC^st.jcong) + st.jsr); } -__device__ __forceinline__ void fill_mix(const hash32_t* header, uint32_t lane_id, uint32_t mix[PROGPOW_REGS]) +__device__ __forceinline__ void fill_mix(const hash32_t header, uint64_t hash_seed, uint32_t lane_id, uint32_t mix[PROGPOW_REGS]) { // Use FNV to expand the per-warp seed to per-lane // Use KISS to expand the per-lane seed to fill mix @@ -122,8 +122,8 @@ __device__ __forceinline__ void fill_mix(const hash32_t* header, uint32_t lane_i kiss99_t st; st.z = fnv1a(fnv_hash, header.uint32s[0]); st.w = fnv1a(fnv_hash, header.uint32s[1]); - st.jsr = fnv1a(fnv_hash, lane_id); - st.jcong = fnv1a(fnv_hash, lane_id); + st.jsr = fnv1a(fnv_hash, ROTL32(hash_seed, lane_id)); + st.jcong = fnv1a(fnv_hash, ROTL32(hash_seed >> 32, lane_id)); #pragma unroll for (int i = 0; i < PROGPOW_REGS; i++) mix[i] = kiss99(st); @@ -157,7 +157,7 @@ progpow_search( __syncthreads(); uint32_t state[25]; // Keccak's state - hash32_t digest; // Carry-over from keccak's output + hash32_t digest; // Carry-over from mix output // Absorb phase for initial round of keccak // 1st fill with header data (8 words) @@ -171,7 +171,7 @@ progpow_search( state[i] = 0; // Run intial keccak round - keccak_f800(&state); + keccak_f800(state); // Main loop #pragma unroll 1 @@ -179,9 +179,13 @@ progpow_search( { uint32_t mix[PROGPOW_REGS]; + // share the first two words of digest across all lanes + uint64_t hash_seed = (uint64_t)state[0] << 32 | state[1]; + hash_seed = __shfl_sync(0xFFFFFFFF, hash_seed, h, PROGPOW_LANES); + // initialize mix for all lanes using first // two words from header_hash - fill_mix(header, lane_id, mix); + fill_mix(header, hash_seed, lane_id, mix); #pragma unroll 1 for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++) @@ -220,7 +224,7 @@ progpow_search( state[i] = 0; // Run keccak loop - keccak_f800(&state); + keccak_f800(state); // Extract result, swap endianness, and compare with target uint64_t result = (uint64_t)cuda_swab32(state[0]) << 32 | cuda_swab32(state[1]); From fa00bb54225173a57ba511e9ca52ffd2885c5fce Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Tue, 10 Mar 2020 14:50:24 +0100 Subject: [PATCH 08/19] No need to consume more bits in fill_mix bits consumed in fill_mix are only initiators to KIS99 --- libethash-cuda/CUDAMiner_kernel.cu | 35 +++++++++++++++++------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index 5445b5bec..73d13e4ae 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -2,6 +2,9 @@ #define SEARCH_RESULTS 4 #endif +#define FNV_PRIME 0x1000193 +#define FNV_OFFSET_BASIS 0x811c9dc5 + typedef struct { uint32_t count; struct { @@ -79,7 +82,7 @@ __device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x) // Keccak - implemented as a variant of SHAKE // The width is 800, with a bitrate of 576, a capacity of 224, and no padding -__device__ __noinline__ void keccak_f800(uint32_t* st) +__device__ __forceinline__ void keccak_f800(uint32_t* st) { // Assumes input state has already been filled @@ -93,7 +96,7 @@ __device__ __noinline__ void keccak_f800(uint32_t* st) } -#define fnv1a(h, d) (h = (uint32_t(h) ^ uint32_t(d)) * uint32_t(0x1000193)) +#define fnv1a(h, d) (h = (uint32_t(h) ^ uint32_t(d)) * uint32_t(FNV_PRIME)) typedef struct { uint32_t z, w, jsr, jcong; @@ -114,16 +117,16 @@ __device__ __forceinline__ uint32_t kiss99(kiss99_t &st) return ((MWC^st.jcong) + st.jsr); } -__device__ __forceinline__ void fill_mix(const hash32_t header, uint64_t hash_seed, uint32_t lane_id, uint32_t mix[PROGPOW_REGS]) +__device__ __forceinline__ void fill_mix(uint32_t* hash_seed, uint32_t lane_id, uint32_t mix[PROGPOW_REGS]) { // Use FNV to expand the per-warp seed to per-lane // Use KISS to expand the per-lane seed to fill mix - uint32_t fnv_hash = 0x811c9dc5; + uint32_t fnv_hash = FNV_OFFSET_BASIS; kiss99_t st; - st.z = fnv1a(fnv_hash, header.uint32s[0]); - st.w = fnv1a(fnv_hash, header.uint32s[1]); - st.jsr = fnv1a(fnv_hash, ROTL32(hash_seed, lane_id)); - st.jcong = fnv1a(fnv_hash, ROTL32(hash_seed >> 32, lane_id)); + st.z = fnv1a(fnv_hash, hash_seed[0]); + st.w = fnv1a(fnv_hash, hash_seed[1]); + st.jsr = fnv1a(fnv_hash, lane_id); + st.jcong = fnv1a(fnv_hash, lane_id); #pragma unroll for (int i = 0; i < PROGPOW_REGS; i++) mix[i] = kiss99(st); @@ -156,8 +159,9 @@ progpow_search( // Force threads to sync and ensure shared mem is in sync __syncthreads(); - uint32_t state[25]; // Keccak's state - hash32_t digest; // Carry-over from mix output + uint32_t state[25]; // Keccak's state + uint32_t hash_seed[2]; // KISS99 initiator + hash32_t digest; // Carry-over from mix output // Absorb phase for initial round of keccak // 1st fill with header data (8 words) @@ -180,12 +184,12 @@ progpow_search( uint32_t mix[PROGPOW_REGS]; // share the first two words of digest across all lanes - uint64_t hash_seed = (uint64_t)state[0] << 32 | state[1]; - hash_seed = __shfl_sync(0xFFFFFFFF, hash_seed, h, PROGPOW_LANES); + hash_seed[0] = __shfl_sync(0xFFFFFFFF, state[0], h, PROGPOW_LANES); + hash_seed[1] = __shfl_sync(0xFFFFFFFF, state[1], h, PROGPOW_LANES); // initialize mix for all lanes using first // two words from header_hash - fill_mix(header, hash_seed, lane_id, mix); + fill_mix(hash_seed, lane_id, mix); #pragma unroll 1 for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++) @@ -193,7 +197,7 @@ progpow_search( // Reduce mix data to a per-lane 32-bit digest - uint32_t digest_lane = 0x811c9dc5; + uint32_t digest_lane = FNV_OFFSET_BASIS; #pragma unroll for (int i = 0; i < PROGPOW_REGS; i++) fnv1a(digest_lane, mix[i]); @@ -202,7 +206,7 @@ progpow_search( hash32_t digest_temp; #pragma unroll for (int i = 0; i < 8; i++) - digest_temp.uint32s[i] = 0x811c9dc5; + digest_temp.uint32s[i] = FNV_OFFSET_BASIS; for (int i = 0; i < PROGPOW_LANES; i += 8) #pragma unroll @@ -219,6 +223,7 @@ progpow_search( // 2nd subsequent 8 words are carried from digest/mix for (int i = 8; i < 16; i++) state[i] = digest.uint32s[i]; + // 3rd all other elements to zero for (int i = 16; i < 25; i++) state[i] = 0; From d362bea15576b498e1b0b671207969df6668869a Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Tue, 10 Mar 2020 15:15:41 +0100 Subject: [PATCH 09/19] Update README.md --- README.md | 61 ++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index d35967f8f..019d610fe 100644 --- a/README.md +++ b/README.md @@ -214,32 +214,11 @@ As with Ethash the input and output of the keccak function are fixed and relativ Test vectors can be found [in the test vectors file](test-vectors.md#keccak_f800_progpow). ```cpp -hash32_t keccak_f800_progpow(hash32_t header, uint64_t seed, hash32_t digest) +void keccak_f800_progpow(uint32_t* state) { - uint32_t st[25]; - - // Initialization - for (int i = 0; i < 25; i++) - st[i] = 0; - - // Absorb phase for fixed 18 words of input - for (int i = 0; i < 8; i++) - st[i] = header.uint32s[i]; - st[8] = seed; - st[9] = seed >> 32; - for (int i = 0; i < 8; i++) - st[10+i] = digest.uint32s[i]; - // keccak_f800 call for the single absorb pass for (int r = 0; r < 22; r++) keccak_f800_round(st, r); - - // Squeeze phase for fixed 8 words of output - hash32_t ret; - for (int i=0; i<8; i++) - ret.uint32s[i] = st[i]; - - return ret; } ``` @@ -418,11 +397,11 @@ void progPowLoop( ``` The flow of the overall algorithm is: -* A keccak hash of the header + nonce to create a seed -* Use the seed to generate initial mix data +* A keccak hash of the header + nonce to create a digest of 256 bits +* Use first two words of digest as seed to generate initial mix data * Loop multiple times, each time hashing random loads and random math into the mix data * Hash all the mix data into a single 256-bit value -* A final keccak hash is computed +* A final keccak hash using carry-over digest from initial data + mix_data final 256 bit value * When mining this final value is compared against a `hash32_t` target ```cpp @@ -433,13 +412,23 @@ hash32_t progPowHash( const uint32_t *dag // gigabyte DAG located in framebuffer - the first portion gets cached ) { + uint32_t* state[25] = {0}; + uint32_t* seed[2]; uint32_t mix[PROGPOW_LANES][PROGPOW_REGS]; - hash32_t digest; + + // Absorb phase for initial round of keccak + // 1st fill with header data (8 words) for (int i = 0; i < 8; i++) - digest.uint32s[i] = 0; + state[i] = header.uint32s[i]; + // 2nd fill with nonce (2 words) + state[8] = nonce; + state[9] = nonce >> 32; + // 3rd all remaining elements to zero + for (int i = 10; i < 25; i++) + state[i] = 0; // keccak(header..nonce) - hash32_t seed_256 = keccak_f800_progpow(header, nonce, digest); + hash32_t digest_256 = keccak_f800_progpow(state); // endian swap so byte 0 of the hash is the MSB of the value uint64_t seed = ((uint64_t)bswap(seed_256.uint32s[0]) << 32) | bswap(seed_256.uint32s[1]); @@ -464,9 +453,16 @@ hash32_t progPowHash( digest.uint32s[i] = FNV_OFFSET_BASIS; for (int l = 0; l < PROGPOW_LANES; l++) digest.uint32s[l%8] = fnv1a(digest.uint32s[l%8], digest_lane[l]); - - // keccak(header .. keccak(header..nonce) .. digest); - return keccak_f800_progpow(header, seed, digest); + + // Absorb digest into state + for (int i = 8; i < 16; i++) + state[i] = digest.uint32s[i]; + + for (int i = 16; i < 25; i++) + state[i] = 0; + + // keccak(header .. keccak(digest_256 .. digest); + keccak_f800_progpow(state); } ``` @@ -494,7 +490,8 @@ Additional test vectors can be found [in the test vectors file](test-vectors.md# ## Change History -- 0.9.3 (proposed) - Reduce parameters PERIOD, CNT_MATH, and CNT_CACHE. See [this medium post](https://medium.com/@ifdefelse/progpow-progress-da5bb31a651b) for details. +- 0.9.4 (proposed) void the [bypass memory hardness](https://github.com/ifdefelse/ProgPOW/issues/51) vulnerability. +- [0.9.3](https://medium.com/@ifdefelse/progpow-progress-da5bb31a651b) - Reduce parameters PERIOD, CNT_MATH, and CNT_CACHE. - [0.9.2](https://github.com/ifdefelse/ProgPOW/blob/0e39b62deb0c9ab14900fc404fcb19cac70240e1/README.md) - Unique sources for math() and prevent rotation by 0 in merge(). Suggested by [SChernykh](https://github.com/ifdefelse/ProgPOW/issues/19) - [0.9.1](https://github.com/ifdefelse/ProgPOW/blob/60bba1c3fdad6a54539fc3e9f05727547de9c58c/README.md) - Shuffle what part of the DAG entry each lane accesses. Suggested by [mbevand](https://github.com/ifdefelse/ProgPOW/pull/13) - [0.9.0](https://github.com/ifdefelse/ProgPOW/blob/a3f62349a1513f0393524683f9671cfe17cca895/README.md) - Unique cache address sources, re-tune parameters From 04f4bee1223bb59bd874ac1476ecf3d0b760509c Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Tue, 10 Mar 2020 17:07:15 +0100 Subject: [PATCH 10/19] Remove redundant parameter definition --- libethash-cuda/CUDAMiner_kernel.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index 73d13e4ae..71888bfd1 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -117,7 +117,7 @@ __device__ __forceinline__ uint32_t kiss99(kiss99_t &st) return ((MWC^st.jcong) + st.jsr); } -__device__ __forceinline__ void fill_mix(uint32_t* hash_seed, uint32_t lane_id, uint32_t mix[PROGPOW_REGS]) +__device__ __forceinline__ void fill_mix(uint32_t* hash_seed, uint32_t lane_id, uint32_t* mix) { // Use FNV to expand the per-warp seed to per-lane // Use KISS to expand the per-lane seed to fill mix From a91b128bfb42174d09a1ce419d4cbe797c753836 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Mon, 16 Mar 2020 00:02:48 +0100 Subject: [PATCH 11/19] Reduce register pressure and void performance impact --- libethash-cuda/CUDAMiner_kernel.cu | 41 +++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index 71888bfd1..3698d1ba0 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -159,23 +159,31 @@ progpow_search( // Force threads to sync and ensure shared mem is in sync __syncthreads(); - uint32_t state[25]; // Keccak's state + //uint32_t state[25]; // Keccak's state uint32_t hash_seed[2]; // KISS99 initiator hash32_t digest; // Carry-over from mix output - // Absorb phase for initial round of keccak - // 1st fill with header data (8 words) - for (int i = 0; i < 8; i++) - state[i] = header.uint32s[i]; - // 2nd fill with nonce (2 words) - state[8] = nonce; - state[9] = nonce >> 32; - // 3rd all remaining elements to zero - for (int i = 10; i < 25; i++) - state[i] = 0; + uint32_t state2[8]; - // Run intial keccak round - keccak_f800(state); + { + // Absorb phase for initial round of keccak + // 1st fill with header data (8 words) + uint32_t state[25]; // Keccak's state + for (int i = 0; i < 8; i++) + state[i] = header.uint32s[i]; + // 2nd fill with nonce (2 words) + state[8] = nonce; + state[9] = nonce >> 32; + // 3rd all remaining elements to zero + for (int i = 10; i < 25; i++) + state[i] = 0; + + // Run intial keccak round + keccak_f800(state); + + for (int i = 0; i < 8; i++) + state2[i] = state[i]; + } // Main loop #pragma unroll 1 @@ -184,8 +192,8 @@ progpow_search( uint32_t mix[PROGPOW_REGS]; // share the first two words of digest across all lanes - hash_seed[0] = __shfl_sync(0xFFFFFFFF, state[0], h, PROGPOW_LANES); - hash_seed[1] = __shfl_sync(0xFFFFFFFF, state[1], h, PROGPOW_LANES); + hash_seed[0] = __shfl_sync(0xFFFFFFFF, state2[0], h, PROGPOW_LANES); + hash_seed[1] = __shfl_sync(0xFFFFFFFF, state2[1], h, PROGPOW_LANES); // initialize mix for all lanes using first // two words from header_hash @@ -217,6 +225,9 @@ progpow_search( digest = digest_temp; } + uint32_t state[25]; // Keccak's state + for (int i = 0; i < 8; i++) + state[i] = state2[i]; // Absorb phase for last round of keccak (256 bits) // 1st initial 8 words of state are kept as carry-over from initial keccak From a249464897023a2c3c9263c484188dae98775289 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Tue, 17 Mar 2020 09:05:00 +0100 Subject: [PATCH 12/19] Digest selector index out-of-bounds --- libethash-cuda/CUDAMiner_kernel.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index 3698d1ba0..fb4dbdf02 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -226,14 +226,15 @@ progpow_search( } uint32_t state[25]; // Keccak's state - for (int i = 0; i < 8; i++) - state[i] = state2[i]; // Absorb phase for last round of keccak (256 bits) // 1st initial 8 words of state are kept as carry-over from initial keccak + for (int i = 0; i < 8; i++) + state[i] = state2[i]; + // 2nd subsequent 8 words are carried from digest/mix for (int i = 8; i < 16; i++) - state[i] = digest.uint32s[i]; + state[i] = digest.uint32s[i - 8]; // 3rd all other elements to zero for (int i = 16; i < 25; i++) From 5ae9aecd6e9e2620dfe4f9df6e6aeeda82325278 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Tue, 17 Mar 2020 09:58:28 +0100 Subject: [PATCH 13/19] CUDA - Use initializer instead of loop to fill zeroes LLVM is smart enough to figure out which elements are zeroes and which are copied. --- libethash-cuda/CUDAMiner_kernel.cu | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index fb4dbdf02..a101a3691 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -168,15 +168,12 @@ progpow_search( { // Absorb phase for initial round of keccak // 1st fill with header data (8 words) - uint32_t state[25]; // Keccak's state + uint32_t state[25] = { 0x0 }; // Keccak's state initialized to zero for (int i = 0; i < 8; i++) state[i] = header.uint32s[i]; // 2nd fill with nonce (2 words) state[8] = nonce; state[9] = nonce >> 32; - // 3rd all remaining elements to zero - for (int i = 10; i < 25; i++) - state[i] = 0; // Run intial keccak round keccak_f800(state); @@ -225,7 +222,7 @@ progpow_search( digest = digest_temp; } - uint32_t state[25]; // Keccak's state + uint32_t state[25] = { 0x0 }; // Keccak's state initialized to zero // Absorb phase for last round of keccak (256 bits) // 1st initial 8 words of state are kept as carry-over from initial keccak @@ -236,10 +233,6 @@ progpow_search( for (int i = 8; i < 16; i++) state[i] = digest.uint32s[i - 8]; - // 3rd all other elements to zero - for (int i = 16; i < 25; i++) - state[i] = 0; - // Run keccak loop keccak_f800(state); From cdb4c318f7923b3d020bd94473ce2f9b78c4fb27 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Tue, 17 Mar 2020 11:59:03 +0100 Subject: [PATCH 14/19] Apply keccak absorb constraints --- libethash-cuda/CUDAMiner_kernel.cu | 36 ++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index a101a3691..b6e731098 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -175,6 +175,10 @@ progpow_search( state[8] = nonce; state[9] = nonce >> 32; + // 3rd apply input constraints + state[10] = keccakf_rndc[0]; + state[18] = keccakf_rndc[6]; + // Run intial keccak round keccak_f800(state); @@ -222,22 +226,30 @@ progpow_search( digest = digest_temp; } - uint32_t state[25] = { 0x0 }; // Keccak's state initialized to zero + uint64_t result; - // Absorb phase for last round of keccak (256 bits) - // 1st initial 8 words of state are kept as carry-over from initial keccak - for (int i = 0; i < 8; i++) - state[i] = state2[i]; + { + uint32_t state[25] = { 0x0 }; // Keccak's state initialized to zero - // 2nd subsequent 8 words are carried from digest/mix - for (int i = 8; i < 16; i++) - state[i] = digest.uint32s[i - 8]; + // Absorb phase for last round of keccak (256 bits) + // 1st initial 8 words of state are kept as carry-over from initial keccak + for (int i = 0; i < 8; i++) + state[i] = state2[i]; - // Run keccak loop - keccak_f800(state); + // 2nd subsequent 8 words are carried from digest/mix + for (int i = 8; i < 16; i++) + state[i] = digest.uint32s[i - 8]; - // Extract result, swap endianness, and compare with target - uint64_t result = (uint64_t)cuda_swab32(state[0]) << 32 | cuda_swab32(state[1]); + // 3rd apply input constraints + state[17] = keccakf_rndc[0]; + state[24] = keccakf_rndc[6]; + + // Run keccak loop + keccak_f800(state); + + // Extract result, swap endianness, and compare with target + result = (uint64_t)cuda_swab32(state[0]) << 32 | cuda_swab32(state[1]); + } // keccak(header .. keccak(header..nonce) .. digest); if (result >= target) From 35946a8968a338cb0dbebdc7cbcba3a31ba78c67 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Tue, 17 Mar 2020 19:46:48 +0100 Subject: [PATCH 15/19] [OpenCL] Implement 0.9.4 --- libethash-cl/CLMiner_kernel.cl | 145 +++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 50 deletions(-) diff --git a/libethash-cl/CLMiner_kernel.cl b/libethash-cl/CLMiner_kernel.cl index 9a9b7e8cf..48fadc5a0 100644 --- a/libethash-cl/CLMiner_kernel.cl +++ b/libethash-cl/CLMiner_kernel.cl @@ -17,6 +17,9 @@ #define HASHES_PER_GROUP (GROUP_SIZE / PROGPOW_LANES) +#define FNV_PRIME 0x1000193 +#define FNV_OFFSET_BASIS 0x811c9dc5 + typedef struct { uint32_t uint32s[32 / sizeof(uint32_t)]; @@ -80,31 +83,19 @@ void keccak_f800_round(uint32_t st[25], const int r) // Keccak - implemented as a variant of SHAKE // The width is 800, with a bitrate of 576, a capacity of 224, and no padding // Only need 64 bits of output for mining -uint64_t keccak_f800(__constant hash32_t const* g_header, uint64_t seed, hash32_t digest) +uint64_t keccak_f800(uint32_t* st) { - uint32_t st[25]; - - for (int i = 0; i < 25; i++) - st[i] = 0; - for (int i = 0; i < 8; i++) - st[i] = g_header->uint32s[i]; - st[8] = seed; - st[9] = seed >> 32; - for (int i = 0; i < 8; i++) - st[10+i] = digest.uint32s[i]; - - for (int r = 0; r < 21; r++) { - keccak_f800_round(st, r); - } - // last round can be simplified due to partial output - keccak_f800_round(st, 21); + // Assumes input state has already been filled + // at higher level - // Byte swap so byte 0 of hash is MSB of result - uint64_t res = (uint64_t)st[1] << 32 | st[0]; - return as_ulong(as_uchar8(res).s76543210); + // Complete all 22 rounds as a separate impl to + // evaluate only first 8 words is wasteful of regsters + for (int r = 0; r < 22; r++) { + keccak_f800_round(st, r); + } } -#define fnv1a(h, d) (h = (h ^ d) * 0x1000193) +#define fnv1a(h, d) (h = (h ^ d) * FNV_PRIME) typedef struct { uint32_t z, w, jsr, jcong; @@ -125,14 +116,14 @@ uint32_t kiss99(kiss99_t *st) return ((MWC^st->jcong) + st->jsr); } -void fill_mix(uint64_t seed, uint32_t lane_id, uint32_t mix[PROGPOW_REGS]) +void fill_mix(uint32_t* seed, uint32_t lane_id, uint32_t* mix) { // Use FNV to expand the per-warp seed to per-lane // Use KISS to expand the per-lane seed to fill mix - uint32_t fnv_hash = 0x811c9dc5; + uint32_t fnv_hash = FNV_OFFSET_BASIS; kiss99_t st; - st.z = fnv1a(fnv_hash, seed); - st.w = fnv1a(fnv_hash, seed >> 32); + st.z = fnv1a(fnv_hash, seed[0]); + st.w = fnv1a(fnv_hash, seed[1]); st.jsr = fnv1a(fnv_hash, lane_id); st.jcong = fnv1a(fnv_hash, lane_id); #pragma unroll @@ -168,35 +159,63 @@ __kernel void ethash_search( const uint32_t group_id = lid / PROGPOW_LANES; // Load the first portion of the DAG into the cache - for (uint32_t word = lid*PROGPOW_DAG_LOADS; word < PROGPOW_CACHE_WORDS; word += GROUP_SIZE*PROGPOW_DAG_LOADS) + for (uint32_t word = lid * PROGPOW_DAG_LOADS; word < PROGPOW_CACHE_WORDS; + word += GROUP_SIZE * PROGPOW_DAG_LOADS) { - dag_t load = g_dag[word/PROGPOW_DAG_LOADS]; - for (int i = 0; i> 32; + + // 3rd apply input constraints + state[10] = keccakf_rndc[0]; + state[18] = keccakf_rndc[6]; + + // Run intial keccak round + keccak_f800(state); + + for (int i = 0; i < 8; i++) + state2[i] = state[i]; + } + + +#pragma unroll 1 for (uint32_t h = 0; h < PROGPOW_LANES; h++) { uint32_t mix[PROGPOW_REGS]; // share the hash's seed across all lanes - //uint64_t hash_seed = __shfl_sync(0xFFFFFFFF, seed, h, PROGPOW_LANES); - if (lane_id == h) - share[group_id].uint64s[0] = seed; + if (lane_id == h) + { + share[group_id].uint32s[0] = state2[0]; + share[group_id].uint32s[1] = state2[1]; + } + barrier(CLK_LOCAL_MEM_FENCE); - uint64_t hash_seed = share[group_id].uint64s[0]; + + //uint64_t hash_seed = share[group_id].uint64s[0]; // initialize mix for all lanes - fill_mix(hash_seed, lane_id, mix); + fill_mix(share[group_id].uint32s, lane_id, mix); // Apparently, no unrolling ("#pragma unroll 1") often results in // miscompiles with AMD OpenCL, so use at least 2 @@ -205,7 +224,7 @@ __kernel void ethash_search( progPowLoop(l, mix, g_dag, c_dag, share[0].uint64s, hack_false); // Reduce mix data to a per-lane 32-bit digest - uint32_t mix_hash = 0x811c9dc5; + uint32_t mix_hash = FNV_OFFSET_BASIS; #pragma unroll for (int i = 0; i < PROGPOW_REGS; i++) fnv1a(mix_hash, mix[i]); @@ -213,7 +232,7 @@ __kernel void ethash_search( // Reduce all lanes to a single 256-bit digest hash32_t digest_temp; for (int i = 0; i < 8; i++) - digest_temp.uint32s[i] = 0x811c9dc5; + digest_temp.uint32s[i] = FNV_OFFSET_BASIS; share[group_id].uint32s[lane_id] = mix_hash; barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll @@ -223,8 +242,34 @@ __kernel void ethash_search( digest = digest_temp; } + uint64_t result; + + { + uint32_t state[25] = { 0x0 }; // Keccak's state initialized to zero + + // Absorb phase for last round of keccak (256 bits) + // 1st initial 8 words of state are kept as carry-over from initial keccak + for (int i = 0; i < 8; i++) + state[i] = state2[i]; + + // 2nd subsequent 8 words are carried from digest/mix + for (int i = 8; i < 16; i++) + state[i] = digest.uint32s[i - 8]; + + // 3rd apply input constraints + state[17] = keccakf_rndc[0]; + state[24] = keccakf_rndc[6]; + + // Run keccak loop + keccak_f800(state); + + uint64_t res = (uint64_t)state[1] << 32 | state[0]; + result = as_ulong(as_uchar8(res).s76543210); + + } + // keccak(header .. keccak(header..nonce) .. digest); - if (keccak_f800(g_header, seed, digest) < target) + if (result <= target) { uint slot = atomic_inc(&g_output[0]) + 1; if(slot < MAX_OUTPUTS) @@ -376,25 +421,25 @@ static void keccak_f1600_round(uint2* a, uint r) t[20] = ROL2(a[2], 62); t[5] = ROL2(a[3], 28); t[15] = ROL2(a[4], 27); - + t[16] = ROL2(a[5], 36); t[1] = ROL2(a[6], 44); t[11] = ROL2(a[7], 6); t[21] = ROL2(a[8], 55); t[6] = ROL2(a[9], 20); - + t[7] = ROL2(a[10], 3); t[17] = ROL2(a[11], 10); t[2] = ROL2(a[12], 43); t[12] = ROL2(a[13], 25); t[22] = ROL2(a[14], 39); - + t[23] = ROL2(a[15], 41); t[8] = ROL2(a[16], 45); t[18] = ROL2(a[17], 15); t[3] = ROL2(a[18], 21); t[13] = ROL2(a[19], 8); - + t[14] = ROL2(a[20], 18); t[24] = ROL2(a[21], 2); t[9] = ROL2(a[22], 61); @@ -419,7 +464,7 @@ static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate) // better with surrounding code, however I haven't done this // without causing the AMD compiler to blow up the VGPR usage. - + //uint o = 25; for (uint r = 0; r < 24;) { @@ -435,8 +480,8 @@ static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate) keccak_f1600_round(a, r++); //if (r == 23) o = out_size; } - } - + } + // final round optimised for digest size //keccak_f1600_round(a, 23, out_size); From 21b7812315b92c450e0b691ac82a95400d0a3575 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Thu, 26 Mar 2020 18:40:26 +0100 Subject: [PATCH 16/19] Amend implicit address space conversion --- libethash-cl/CLMiner_kernel.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libethash-cl/CLMiner_kernel.cl b/libethash-cl/CLMiner_kernel.cl index 48fadc5a0..53407300a 100644 --- a/libethash-cl/CLMiner_kernel.cl +++ b/libethash-cl/CLMiner_kernel.cl @@ -116,7 +116,7 @@ uint32_t kiss99(kiss99_t *st) return ((MWC^st->jcong) + st->jsr); } -void fill_mix(uint32_t* seed, uint32_t lane_id, uint32_t* mix) +void fill_mix(local uint32_t* seed, uint32_t lane_id, uint32_t* mix) { // Use FNV to expand the per-warp seed to per-lane // Use KISS to expand the per-lane seed to fill mix From f4b78d00282767643bff0e3d2c45ef27bc17b712 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Thu, 26 Mar 2020 19:31:43 +0100 Subject: [PATCH 17/19] [OpenCL] Fix compile errors --- libethash-cl/CLMiner_kernel.cl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libethash-cl/CLMiner_kernel.cl b/libethash-cl/CLMiner_kernel.cl index 53407300a..6065c9a1c 100644 --- a/libethash-cl/CLMiner_kernel.cl +++ b/libethash-cl/CLMiner_kernel.cl @@ -181,7 +181,7 @@ __kernel void ethash_search( // Absorb phase for initial round of keccak // 1st fill with header data (8 words) for (int i = 0; i < 8; i++) - state[i] = header.uint32s[i]; + state[i] = header->uint32s[i]; // 2nd fill with nonce (2 words) state[8] = nonce; state[9] = nonce >> 32; @@ -263,9 +263,8 @@ __kernel void ethash_search( // Run keccak loop keccak_f800(state); - uint64_t res = (uint64_t)state[1] << 32 | state[0]; + int64_t res = (uint64_t)state[1] << 32 | state[0]; result = as_ulong(as_uchar8(res).s76543210); - } // keccak(header .. keccak(header..nonce) .. digest); From b11671e9560b3c40e124ec89604f4ca13f49f77c Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Thu, 30 Apr 2020 17:55:16 +0200 Subject: [PATCH 18/19] Update 0.9.4 documentation --- README.md | 177 ++++++++----- libethash-cuda/CUDAMiner_kernel.cu | 4 +- test-vectors.md | 409 +++++------------------------ test/kernel.cu | 148 +++++------ test/result.log | 138 ---------- 5 files changed, 255 insertions(+), 621 deletions(-) delete mode 100644 test/result.log diff --git a/README.md b/README.md index 019d610fe..76d10ad8d 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,26 @@ Ethash requires external memory due to the large size of the DAG. However that ## ProgPoW Algorithm Walkthrough -The DAG is generated exactly as in Ethash. All the parameters (epoch length, DAG size, etc) are unchanged. See the original [Ethash](https://github.com/ethereum/wiki/wiki/Ethash) spec for details on generating the DAG. +Up to release 0.9.3 the DAG is generated exactly as in Ethash. All the parameters (epoch length, DAG size, etc) are unchanged. See the original [Ethash](https://github.com/ethereum/wiki/wiki/Ethash) spec for details on generating the DAG. + +Release 0.9.3 has been software and hardware audited: +* [Least Authority — ProgPoW Software Audit PDF](https://leastauthority.com/static/publications/Least%20Authority%20-%20ProgPow%20Algorithm%20Final%20Audit%20Report.pdf) +* [Bob Rao - ProgPoW Hardware Audit PDF](https://github.com/ethereum-cat-herders/progpow-audit/raw/master/Bob%20Rao%20-%20ProgPOW%20Hardware%20Audit%20Report%20Final.pdf) + +Following the suggestion expressed by Least Authority in their findings, new proposed release 0.9.4 introduces a tweak in DAG generation in order to mitigate the possibility of a "Light Evaluation" attack. +This change implies the modification of `ETHASH_DATASET_PARENTS` from a value of 256 to the new value of 512. Due to this the DAG memory file used by ProgPoW is no more compatible with the one used by Ethash (epoch lenght and size increase ratio remain the same though). + +After the audits release a clever finding by [Kik](https://github.com/kik/) disclosed an exploitable condition to [bypass ProgPoW memory hardness](https://github.com/kik/progpow-exploit). Worth to mention the exploit would require the availability of a customized node able to accept modified block headers by the miner. +Purpose of this new spec release is to patch the condition modifying the input state of the last keccak pass so it changes from : +* header (256 bits) + +* seed for mix initiator (64 bits) + +* mix from main loop (256 bits) +* no padding +to +* digest from initial keccak (256 bits) + +* mix from main loop (256 bits) + +* padding +thus widening the constraint to target in keccak [brute force keccak linear searches](https://github.com/kik/progpow-exploit) from 64 to 256 bits. ProgPoW can be tuned using the following parameters. The proposed settings have been tuned for a range of existing, commodity GPUs: @@ -124,21 +143,26 @@ ProgPoW can be tuned using the following parameters. The proposed settings have * `PROGPOW_CNT_CACHE`: The number of cache accesses per loop * `PROGPOW_CNT_MATH`: The number of math operations per loop -The value of these parameters has been tweaked between version 0.9.2 (live on the Gangnam testnet) and 0.9.3 (proposed for Ethereum adoption). See [this medium post](https://medium.com/@ifdefelse/progpow-progress-da5bb31a651b) for details. +The value of these parameters has been tweaked between version 0.9.2 (live on the Gangnam testnet) and 0.9.3 (proposed for [Ethereum adoption](https://github.com/ethereum/EIPs/blob/master/EIPS/eip-1057.md)). See [this medium post](https://medium.com/@ifdefelse/progpow-progress-da5bb31a651b) for details. +Release 0.9.4 keeps the same tunables of 0.9.3 and includes the tweak for DAG generation. -| Parameter | 0.9.2 | 0.9.3 | -|-----------------------|-------|-------| -| `PROGPOW_PERIOD` | `50` | `10` | -| `PROGPOW_LANES` | `16` | `16` | -| `PROGPOW_REGS` | `32` | `32` | -| `PROGPOW_DAG_LOADS` | `4` | `4` | -| `PROGPOW_CACHE_BYTES` | `16x1024` | `16x1024` | -| `PROGPOW_CNT_DAG` | `64` | `64` | -| `PROGPOW_CNT_CACHE` | `12` | `11` | -| `PROGPOW_CNT_MATH` | `20` | `18` | +| Parameter | 0.9.2 | 0.9.3 | 0.9.4 | +|-----------------------|-------|-------|-------| +| `PROGPOW_PERIOD` | `50` | `10` | `10` | +| `PROGPOW_LANES` | `16` | `16` | `16` | +| `PROGPOW_REGS` | `32` | `32` | `32` | +| `PROGPOW_DAG_LOADS` | `4` | `4` | `4` | +| `PROGPOW_CACHE_BYTES` | `16x1024` | `16x1024` | `16x1024` | +| `PROGPOW_CNT_DAG` | `64` | `64` | `64` | +| `PROGPOW_CNT_CACHE` | `12` | `11` | `11` | +| `PROGPOW_CNT_MATH` | `20` | `18` | `18` | +| DAG Parameter | 0.9.2 | 0.9.3 | 0.9.4 | +|--------------------------|-------|-------|-------| +| `ETHASH_DATASET_PARENTS` | `256` | `256` | `512` | -The random program changes every `PROGPOW_PERIOD` blocks (default `50`, roughly 12.5 minutes) to ensure the hardware executing the algorithm is fully programmable. If the program only changed every DAG epoch (roughly 5 days) certain miners could have time to develop hand-optimized versions of the random sequence, giving them an undue advantage. + +The random program changes every `PROGPOW_PERIOD` blocks (default `10`, roughly 2 minutes) to ensure the hardware executing the algorithm is fully programmable. If the program only changed every DAG epoch (roughly 5 days) certain miners could have time to develop hand-optimized versions of the random sequence, giving them an undue advantage. Sample code is written in C++, this should be kept in mind when evaluating the code in the specification. @@ -211,20 +235,25 @@ Like Ethash Keccak is used to seed the sequence per-nonce and to produce the fin As with Ethash the input and output of the keccak function are fixed and relatively small. This means only a single "absorb" and "squeeze" phase are required. For a pseudo-code implementation of the `keccak_f800_round` function see the `Round[b](A,RC)` function in the "Pseudo-code description of the permutations" section of the [official Keccak specs](https://keccak.team/keccak_specs_summary.html). -Test vectors can be found [in the test vectors file](test-vectors.md#keccak_f800_progpow). - ```cpp -void keccak_f800_progpow(uint32_t* state) +hash32_t keccak_f800_progpow(uint32_t* state) { // keccak_f800 call for the single absorb pass for (int r = 0; r < 22; r++) keccak_f800_round(st, r); + + // Squeeze phase for fixed 8 words of output + hash32_t ret; + for (int i=0; i<8; i++) + ret.uint32s[i] = st[i]; + + return ret; } ``` The inner loop uses FNV and KISS99 to generate a random sequence from the `prog_seed`. This random sequence determines which mix state is accessed and what random math is performed. -Since the `prog_seed` changes only once per `PROGPOW_PERIOD` (50 blocks or about 12.5 minutes) it is expected that while mining `progPowLoop` will be evaluated on the CPU to generate source code for that period's sequence. The source code will be compiled on the CPU before running on the GPU. You can see an example sequence and generated source code in [kernel.cu](test/kernel.cu). +Since the `prog_seed` changes only once per `PROGPOW_PERIOD` (10 blocks or about 2 minutes) it is expected that while mining `progPowLoop` will be evaluated on the CPU to generate source code for that period's sequence. The source code will be compiled on the CPU before running on the GPU. You can see an example sequence and generated source code in [kernel.cu](test/kernel.cu). Test vectors can be found [in the test vectors file](test-vectors.md#progPowInit). @@ -397,40 +426,52 @@ void progPowLoop( ``` The flow of the overall algorithm is: -* A keccak hash of the header + nonce to create a digest of 256 bits +* A keccak hash of the header + nonce to create a digest of 256 bits from keccak_f800 (padding is consistent with custom one in ethash) * Use first two words of digest as seed to generate initial mix data * Loop multiple times, each time hashing random loads and random math into the mix data * Hash all the mix data into a single 256-bit value -* A final keccak hash using carry-over digest from initial data + mix_data final 256 bit value +* A final keccak hash using carry-over digest from initial data + mix_data final 256 bit value (padding is consistent with custom one in ethash) * When mining this final value is compared against a `hash32_t` target ```cpp hash32_t progPowHash( - const uint64_t prog_seed, // value is (block_number/PROGPOW_PERIOD) + const uint64_t prog_seed, // value is (block_number/PROGPOW_PERIOD) const uint64_t nonce, const hash32_t header, - const uint32_t *dag // gigabyte DAG located in framebuffer - the first portion gets cached + const uint32_t *dag // gigabyte DAG located in framebuffer - the first portion gets cached ) { - uint32_t* state[25] = {0}; - uint32_t* seed[2]; + hash32_t hash_init; + hash32_t hash_final; + uint32_t mix[PROGPOW_LANES][PROGPOW_REGS]; - // Absorb phase for initial round of keccak - // 1st fill with header data (8 words) - for (int i = 0; i < 8; i++) - state[i] = header.uint32s[i]; - // 2nd fill with nonce (2 words) - state[8] = nonce; - state[9] = nonce >> 32; - // 3rd all remaining elements to zero - for (int i = 10; i < 25; i++) - state[i] = 0; - - // keccak(header..nonce) - hash32_t digest_256 = keccak_f800_progpow(state); - // endian swap so byte 0 of the hash is the MSB of the value - uint64_t seed = ((uint64_t)bswap(seed_256.uint32s[0]) << 32) | bswap(seed_256.uint32s[1]); + /* + ======================================== + Absorb phase for initial keccak pass + ======================================== + */ + + { + uint32_t state[25] = {0x0}; + // 1st fill with header data (8 words) + for (int i = 0; i < 8; i++) + state[i] = header.uint32s[i]; + + // 2nd fill with nonce (2 words) + state[8] = nonce; + state[9] = nonce >> 32; + + // 3rd apply padding + state[10] = 0x00000001; + state[18] = 0x80008081; + + // keccak(header..nonce) + hash_init = keccak_f800_progpow(state); + + // get the seed to initialize mix + seed = ((uint64_t)hash_init.uint32s[1] << 32) | hash_init.uint32s[0]); + } // initialize mix for all lanes for (int l = 0; l < PROGPOW_LANES; l++) @@ -448,49 +489,67 @@ hash32_t progPowHash( for (int i = 0; i < PROGPOW_REGS; i++) digest_lane[l] = fnv1a(digest_lane[l], mix[l][i]); } + // Reduce all lanes to a single 256-bit digest for (int i = 0; i < 8; i++) digest.uint32s[i] = FNV_OFFSET_BASIS; for (int l = 0; l < PROGPOW_LANES; l++) digest.uint32s[l%8] = fnv1a(digest.uint32s[l%8], digest_lane[l]); - - // Absorb digest into state - for (int i = 8; i < 16; i++) - state[i] = digest.uint32s[i]; - - for (int i = 16; i < 25; i++) - state[i] = 0; - - // keccak(header .. keccak(digest_256 .. digest); - keccak_f800_progpow(state); + + /* + ======================================== + Absorb phase for final keccak pass + ======================================== + */ + + { + uint32_t state[25] = {0x0}; + + // 1st fill with hash_init (8 words) + for (int i = 0; i < 8; i++) + state[i] = hash_init.uint32s[i]; + + // 2nd fill with digest from main loop + for (int i = 8; i < 16; i++) + state[i] = digest.uint32s[i - 8]; + + // 3rd apply padding + state[17] = 0x00000001; + state[24] = 0x80008081; + + // keccak(header..nonce) + hash_final = keccak_f800_progpow(state); + } + + // Compare hash final to target + [...] + } ``` - ## Example / Testcase -For ProgPoW 0.9.2: +For ProgPoW 0.9.4: -The random sequence generated for block 30,000 (prog_seed 600) can been seen in [kernel.cu](test/kernel.cu). +The random sequence generated for block 30,000 (prog_seed 3,000) can been seen in [kernel.cu](test/kernel.cu). The algorithm run on block 30,000 produces the following digest and result: ``` -header ffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff -nonce 123456789abcdef0 - -digest: 11f19805c58ab46610ff9c719dcf0a5f18fa2f1605798eef770c47219274767d -result: 5b7ccd472dbefdd95b895cac8ece67ff0deb5a6bd2ecc6e162383d00c3728ece +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x123456789abcdef0 +Hash init : 0xee304846ddd0a47b98179e96b60ec5ceeae2727834367e593de780e3e6d1892f +Mix seed : 0x7ba4d0dd464830ee +Mix hash : 0x493c13e9807440571511b561132834bbd558dddaa3b70c09515080a6a1aff6d0 +Hash final : 0x46b72b75f238bea3fcfd227e0027dc173dceaa1fb71744bd3d5e030ed2fed053 ``` -A full run showing some intermediate values can be seen in [result.log](test/result.log) - Additional test vectors can be found [in the test vectors file](test-vectors.md#progPowHash). ## Change History -- 0.9.4 (proposed) void the [bypass memory hardness](https://github.com/ifdefelse/ProgPOW/issues/51) vulnerability. +- 0.9.4 (current) - Patch the [bypass memory hardness](https://github.com/ifdefelse/ProgPOW/issues/51) vulnerability. - [0.9.3](https://medium.com/@ifdefelse/progpow-progress-da5bb31a651b) - Reduce parameters PERIOD, CNT_MATH, and CNT_CACHE. - [0.9.2](https://github.com/ifdefelse/ProgPOW/blob/0e39b62deb0c9ab14900fc404fcb19cac70240e1/README.md) - Unique sources for math() and prevent rotation by 0 in merge(). Suggested by [SChernykh](https://github.com/ifdefelse/ProgPOW/issues/19) - [0.9.1](https://github.com/ifdefelse/ProgPOW/blob/60bba1c3fdad6a54539fc3e9f05727547de9c58c/README.md) - Shuffle what part of the DAG entry each lane accesses. Suggested by [mbevand](https://github.com/ifdefelse/ProgPOW/pull/13) diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu index b6e731098..d18025c57 100644 --- a/libethash-cuda/CUDAMiner_kernel.cu +++ b/libethash-cuda/CUDAMiner_kernel.cu @@ -175,7 +175,7 @@ progpow_search( state[8] = nonce; state[9] = nonce >> 32; - // 3rd apply input constraints + // 3rd apply padding state[10] = keccakf_rndc[0]; state[18] = keccakf_rndc[6]; @@ -240,7 +240,7 @@ progpow_search( for (int i = 8; i < 16; i++) state[i] = digest.uint32s[i - 8]; - // 3rd apply input constraints + // 3rd apply padding state[17] = keccakf_rndc[0]; state[24] = keccakf_rndc[6]; diff --git a/test-vectors.md b/test-vectors.md index 0901134e2..2bc303f62 100644 --- a/test-vectors.md +++ b/test-vectors.md @@ -24,7 +24,7 @@ For `z`=`362436069`, `w`=`521288629`, `jsr`=`123456789`, and `jcong`=`380116160` ## fill_mix -For `hash_seed`=`0xEE304846DDD0A47B` and `lane_id`=`0` the values stored in the `mix` array will be +For `seed`=`0xEE304846DDD0A47B` and `lane_id`=`0` the values stored in the `mix` array will be > ``` > 0x10C02F0D, 0x99891C9E, 0xC59649A0, 0x43F0394D, @@ -50,26 +50,6 @@ For the same hash and `lane_id`=`13` the value in the `mix` array will be > 0xB4CCB32D, 0x83A09132, 0x27EA8279, 0x3837DDA3 > ``` -## keccak_f800_progpow - -Test case 1: - -| | | -| -------- | ----------------------------------------------------------------------------------------------------------------- | -| header | `0xCCDDEEFF`, `0x8899AABB`, `0x44556677`, `0x00112233`,
`0x33221100`, `0x77665544`, `0xBBAA9988`, `0xFFEEDDCC` | -| seed | `0x123456789ABCDEF0` | -| digest | `0x00000000`, `0x00000000`, `0x00000000`, `0x00000000`,
`0x00000000`, `0x00000000`, `0x00000000`, `0x00000000` | -| _result_ | `0x464830EE`, `0x7BA4D0DD`, `0x969E1798`, `0xCEC50EB6`,
`0x7872E2EA`, `0x597E3634`, `0xE380E73D`, `0x2F89D1E6` | - -Test case 2: - -| | | -| -------- | ----------------------------------------------------------------------------------------------------------------- | -| header | `0xCCDDEEFF`, `0x8899AABB`, `0x44556677`, `0x00112233`,
`0x33221100`, `0x77665544`, `0xBBAA9988`, `0xFFEEDDCC` | -| seed | `0xEE304846DDD0A47B` | -| digest | `0x0598F111`, `0x66B48AC5`, `0x719CFF10`, `0x5F0ACF9D`,
`0x162FFA18`, `0xEF8E7905`, `0x21470C77`, `0x7D767492` | -| _result_ | `0x47CD7C5B`, `0xD9FDBE2D`, `0xAC5C895B`, `0xFF67CE8E`,
`0x6B5AEB0D`, `0xE1C6ECD2`, `0x003D3862`, `0xCE8E72C3` | - ## progPowInit For ProgPow period 600 (block 30,000) the configurations should be @@ -120,338 +100,81 @@ Kiss 99 state: ## progPowLoop For the first loop iteration of block 30,000 the seed to use for `fill_mix` -would be `0xEE304846DDD0A47B`. A two dimensional `mix` array should be created -passing the rows into `fill_mix` witht he column number as the loop argument. +would be `0x7ba4d0dd464830ee`. A two dimensional `mix` array should be created +passing the rows into `fill_mix` with the column number as the loop argument. The state of the mix array after the call to `progPowLoop` for block 30,000, loop 1 are as follows. -`mix[0]` - - -> ``` -> 0x40E09E9C, 0x967A7DF0, 0x8626BB1F, 0x12C2392F, -> 0xA21D8305, 0x44C2702E, 0x94C93945, 0x6B66B158, -> 0x0CF00FAA, 0x26F5E6B5, 0x36EC0134, 0xC89805AF, -> 0x58118540, 0x8617DC4D, 0xC759F486, 0x8A81E396, -> 0x22443D4D, 0x64291E2F, 0x1998AB7F, 0x11C0FBBB, -> 0xBEA9C139, 0x82D1E47E, 0x7ED3E850, 0x2F81531A, -> 0xBBDFBC4E, 0xF58AEE4D, 0x3CA34321, 0x357BD48A, -> 0x2F9C8B5D, 0x2319B193, 0x2856BB38, 0x2E3C33E6 -> ``` - -`mix[1]` - - -> ``` -> 0x4EB8A8F9, 0xD978BF17, 0x7D5074D4, 0x7A092D5D, -> 0x8682D1BE, 0xC3D2941C, 0xF1A1A38B, 0x54BB6D34, -> 0x2F0FB257, 0xB5464B50, 0x40927B67, 0xBB92A7E1, -> 0x1305A517, 0xE06C6765, 0xA75FD647, 0x9F232D6E, -> 0x0D9213ED, 0x8884671D, 0x54352B96, 0x6772E58E, -> 0x1B8120C9, 0x179F3CFB, 0x116FFC82, 0x6D019BCE, -> 0x1C26A750, 0x89716638, 0x02BEB948, 0x2E0AD5CE, -> 0x7FA915B2, 0x93024F2F, 0x2F58032E, 0xF02E550C -> ``` - -`mix[2]` - - -> ``` -> 0x008FF9BD, 0xC41F9802, 0x2E36FDC8, 0x9FBA2A91, -> 0x0A921670, 0x231308E6, 0xEF09A56E, 0x9657A64A, -> 0xF67723FE, 0x963DCD40, 0x354CBFDB, 0x57C07B9A, -> 0x06AF5B40, 0xBA5DE5A6, 0xDA5AAE7B, 0x9F8A5E4B, -> 0x7D6AFC9A, 0xE4783F78, 0x89B24946, 0x5EE94228, -> 0xA209DAAA, 0xDCC27C64, 0x3366FBED, 0x0FEFB673, -> 0x0FC205E3, 0xB61515B2, 0x70A45E9B, 0xBB225E5D, -> 0xB8C38EA0, 0xE01DE9B4, 0x866FAA5B, 0x1A125220 -> ``` - -`mix[3]` - - -> ``` -> 0xE5F9C5CC, 0x6F75CFA2, 0xE0F50924, 0xE7B4F5EF, -> 0x779B903D, 0x5F068253, 0x05FF68E5, 0x39348653, -> 0x654B89E4, 0x0559769E, 0xA3D46B93, 0xD084454D, -> 0xCFC5CF7D, 0x8C11D8E4, 0x795BDB59, 0xD9E03113, -> 0xBAE8C355, 0x12B63814, 0x4046A018, 0xA269A32E, -> 0x54A57C4B, 0x2ED1065B, 0xB69A2C76, 0x4AEF0950, -> 0x6C2D187B, 0x8252FAE7, 0x3E9C0ED2, 0x26E47B15, -> 0xFEFB48E3, 0xDA088C7F, 0xA82B0379, 0xA49C6D86 -> ``` - -`mix[4]` - - -> ``` -> 0xB926334C, 0x686A29AF, 0xD9E2EF15, 0x1C8A2D39, -> 0x307ED4F4, 0x2ABB1DB6, 0xD6F95128, 0xDFCA05F8, -> 0x904D9472, 0xEC09E200, 0x7143F47F, 0xEE488438, -> 0xFCA48DA8, 0xA64C7DD4, 0xC4AE9A30, 0xEBA30BC9, -> 0xB02630BF, 0xD1DF40CC, 0x4DFE8B7B, 0x205C97B3, -> 0xE40376F8, 0x2491117E, 0x34984321, 0xA01546A7, -> 0xB254F2F9, 0xC78A7C25, 0xFFC615E2, 0x5839FC88, -> 0x2A04DF6C, 0xC02A9A8A, 0x39238EAD, 0x7139060C -> ``` - -`mix[5]` - - -> ``` -> 0xC416E54B, 0x64AD1C57, 0xBF7CBA55, 0x176F714E, -> 0xBE733426, 0x995C4132, 0x5F50F779, 0x0F76FDF3, -> 0x526F7870, 0xE56A1A8A, 0xDCEB677E, 0xD471CC19, -> 0xA9ED60E4, 0x145E807F, 0x8D652E92, 0x80E8116F, -> 0xFF1A37EB, 0x1E0C49A1, 0x59D756DA, 0x39A8E761, -> 0x2F0F646F, 0x43F41278, 0x88CC48DA, 0x8FDFF7A4, -> 0x9AEACA2E, 0x59E7808C, 0x7F72E46B, 0xCA572333, -> 0xC6029C88, 0x7736E592, 0xF1338231, 0x262B2C7F -> ``` - -`mix[6]` - - -> ``` -> 0x3C554151, 0x70999423, 0x64BB49A8, 0xF9EBE9E9, -> 0x7D9C28CF, 0x23EE7659, 0xD6504FCF, 0x1C58C2A1, -> 0x62B9C627, 0x680AE248, 0xF196A153, 0x2A3C345A, -> 0x860E6EB2, 0x266D2652, 0x3C9F2420, 0xF790A538, -> 0x710A5523, 0xBEA2603A, 0x1C1CC272, 0xF91D482A, -> 0x1CA19931, 0x7A80ED37, 0x9572513D, 0x376F1CFE, -> 0xE57C1264, 0xE47BF931, 0xC7310E05, 0x7866CC9E, -> 0xC676BBD5, 0x4C167FEB, 0x0FE03D2B, 0x46C6D26C -> ``` - -`mix[7]` - - -> ``` -> 0x3395F65A, 0x7142A5B1, 0x97780661, 0xE5EE45B8, -> 0xCD9FDC42, 0x25BF044C, 0x0350F81B, 0x55D50703, -> 0xA8CB893E, 0xEE795201, 0xC2D6E598, 0xC2AC2D7A, -> 0xD2E81716, 0xAD876790, 0x0F3339C7, 0xEEC31E01, -> 0xA293ABF6, 0x28AE317D, 0x44A7AC05, 0xBEBA1C5E, -> 0x325ED29E, 0x4344131E, 0x921CD8DD, 0x08AB9E0B, -> 0xC18E66A6, 0x87E6BCA3, 0x24CE82AE, 0xC910B4F1, -> 0x9E513EC0, 0xA1B8CB76, 0xF0455815, 0x36BC0DCF -> ``` - -`mix[8]` - - -> ``` -> 0x0117C85F, 0xE018F2C6, 0x416C897D, 0x9D288A0F, -> 0x2AA9EA93, 0x5A6D3CEA, 0xAA99B726, 0x0A42DAB7, -> 0x72F6EA4A, 0x1DB074E6, 0x2E2A606C, 0xAC5D509B, -> 0x53F13E85, 0x1D44B521, 0x24234C42, 0xAD5BAD70, -> 0xAB2DA791, 0x6479546A, 0xD27B3771, 0xBB0A09DD, -> 0x6D3C8056, 0x96572D4B, 0x52DB6535, 0x3D242BC1, -> 0xF37D7C7A, 0xA60F7111, 0x59B59667, 0xF28635B0, -> 0xC2A8F9F5, 0x7CFB9CCB, 0xDF8697AA, 0xA3260D94 -> ``` - -`mix[9]` - - -> ``` -> 0xA387FC4B, 0xC757D3A0, 0xA584E879, 0xB0A1EC29, -> 0x82CB2EC3, 0x6BF33664, 0x41FECC42, 0xF60C2AC5, -> 0xEA250BE5, 0x42BE9F33, 0x9227B0B3, 0x9080A6AB, -> 0xAF193598, 0xC708BC8A, 0x020CDEDB, 0x7FA2F773, -> 0x4338E670, 0x069E0242, 0x5AD87326, 0xD7A87124, -> 0x220D5C46, 0x26D3400D, 0x4899D1EE, 0x90EAD2F6, -> 0xFA3F1F74, 0x9C5A5D58, 0xAE20567C, 0x424B690D, -> 0xC9A4057A, 0x9F2A5CD1, 0xAA33CD5F, 0x18F58C00 -> ``` - -`mix[10]` - - -> ``` -> 0xEAFE893C, 0x1ABB2971, 0x29803BB3, 0x5BC2F71F, -> 0x619DAFAD, 0xD9CFEFB6, 0xB4FEFAB5, 0x5EB249EC, -> 0x1A6E2B3A, 0xFB05DD28, 0xDCB33C2E, 0x630BB8AE, -> 0x43463B39, 0x3BD2F552, 0xFB20C0A2, 0x3383BA34, -> 0x2E9C1A99, 0x60A949B2, 0x861372AB, 0xC149D929, -> 0xA77A0A93, 0xE0CEE0D9, 0x791E7E82, 0x66A8D75A, -> 0x44D1845F, 0xE534DC4A, 0x2C7DD20C, 0xEEDAB329, -> 0x3209FE2A, 0x0C0406BC, 0xD6D4BD2A, 0x5FDB13CC -> ``` - -`mix[11]` - - -> ``` -> 0x2520ABB3, 0xCD942485, 0x9A2929BC, 0x0E10F18C, -> 0xDFB1815E, 0x8BEF05A3, 0x531A8837, 0x668838E4, -> 0xBACCE200, 0x003F85C2, 0x56226F05, 0xC2233173, -> 0x2F39A0D9, 0xF4466D0D, 0x0B9E686C, 0x82C69BDA, -> 0x0C8A8CD6, 0xA93F3001, 0x36A65EC1, 0x40CCFD7A, -> 0x84484E23, 0xF0896D45, 0x06D9F760, 0x6559142C, -> 0x9FFE2E88, 0x9593DC89, 0x89C9E3B9, 0x33285F41, -> 0x16F636C8, 0xA08169C7, 0xA5E1C956, 0xC22CCF52 -> ``` - -`mix[12]` - - -> ``` -> 0xDC3B8CAA, 0xC6941197, 0x9969D596, 0x46453D3E, -> 0x568EAFEA, 0x5B823345, 0xDE606E8E, 0x7523C86D, -> 0x0EDAF441, 0x00C3D848, 0xAE5BAB99, 0xD705B9EE, -> 0x54B49E3D, 0xF364A6A4, 0x42C55975, 0xFE41EED5, -> 0xAD46170F, 0xAABE4868, 0x270379F9, 0xD33D0D7C, -> 0xF39C476C, 0xA449118E, 0x71BCC1E4, 0x5E300E77, -> 0x1CACD489, 0x4D82FABD, 0x090F9F80, 0xB2DB9626, -> 0xE12A973B, 0x1B77460C, 0xD25F89F5, 0x5753612E -> ``` - -`mix[13]` - - -> ``` -> 0x042D951C, 0x38833AA7, 0xBEA9894D, 0x7AE7F381, -> 0x42DB6723, 0x1FB0294F, 0x41452A28, 0xA7A97B9C, -> 0x228AA7EA, 0x781A7420, 0x4589736D, 0xB3C19349, -> 0x685EF9E6, 0xB4987DF6, 0xC9C3B188, 0x2DCA6A03, -> 0xE89A6D3D, 0x50EF7CF5, 0xF6274868, 0x8AA22824, -> 0x980FFDE3, 0xD4A6CB4E, 0x06FF9E1A, 0xBADB6DF5, -> 0xEDE3ADF3, 0xC9CF45F6, 0xFDFA194C, 0xAF076AA8, -> 0x7B876CEA, 0xB0C89575, 0x35A72155, 0x6CFDFC06 -> ``` - -`mix[14]` - - -> ``` -> 0x0E3E28C8, 0xEC329DEC, 0x06D0A1D1, 0xF95ABEF8, -> 0x168DCF28, 0xDD7714C1, 0x769C119E, 0xA5530A7D, -> 0x1EEACB59, 0x30FD21BB, 0x082A3691, 0x1C4C9BCA, -> 0x420F27DE, 0xA8FDA3AE, 0xE182142E, 0x5102F0FF, -> 0x15B82277, 0x120C3217, 0x7BE714ED, 0xA251DCD5, -> 0x6FB4F831, 0xB71D7B32, 0xD5F7A04A, 0x763E1A20, -> 0x38E68B0C, 0xBB5A4121, 0x9340BF06, 0x948B03F8, -> 0xE71BF17B, 0x1BB5F06B, 0x26F2A200, 0x5F28C415 -> ``` - -`mix[15]` - - -> ``` -> 0xC818CD64, 0xBC910343, 0xB18B7776, 0x7182DEBA, -> 0x9DB319EE, 0x9AE7F32F, 0x3CA9F8B5, 0xC63F48ED, -> 0x8321533A, 0x059C96B1, 0x8DCDA60A, 0x75B6C1D1, -> 0xC3406B57, 0x3DFE9E9B, 0xC01E1FD7, 0xC4643218, -> 0x6873F0BA, 0x8ABD36B9, 0xA74D0CBD, 0x8A637118, -> 0x6916416C, 0xB6E3A8DD, 0xB68DD4FA, 0xFBD543EE, -> 0x56F05592, 0x33D6DB82, 0x58D0A7DD, 0x18630C6E, -> 0xB33749CA, 0x5D2E87F7, 0x0F3C39DB, 0x3CAE9895 -> ``` +[TO DO] ## progPowHash -Block 30000: - -- `prog_seed` - 600 -- `nonce` - `123456789abcdef0` -- `header` - `ffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff` -- _digest_ - `11f19805c58ab46610ff9c719dcf0a5f18fa2f1605798eef770c47219274767d` -- _result_ - `5b7ccd472dbefdd95b895cac8ece67ff0deb5a6bd2ecc6e162383d00c3728ece` - Block 0: - -- `prog_seed` - 0 -- `nonce` - `0000000000000000` -- `header` - `0000000000000000000000000000000000000000000000000000000000000000` -- _digest_ - `faeb1be51075b03a4ff44b335067951ead07a3b078539ace76fd56fc410557a3` -- _result_ - `63155f732f2bf556967f906155b510c917e48e99685ead76ea83f4eca03ab12` +``` +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x123456789abcdef0 +Hash init : 0xee304846ddd0a47b98179e96b60ec5ceeae2727834367e593de780e3e6d1892f +Mix seed : 0x7ba4d0dd464830ee +Mix hash : 0xc2e883b6876ec4cc514b9cea269f343095619faf9f2edcafb3fcf6928fa58141 +Hash final : 0xfa70fbf9979f80ec3db2c3f118a5e683fcf5f54ea7edc41b0b5d336508694cb8 + +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x0000000000000001 +Hash init : 0xc69d81127f23e360ee0e7ca302fdc1f9e25a772a3728ec48be074a3d398e7677 +Mix seed : 0x60e3237f12819dc6 +Mix hash : 0xab94222a9736b2c93282fbf6e7217f792f87504033a83eb5501beb24f0d235e7 +Hash final : 0x1260d102572f6ab9840556e8766ba670b511bcf768767b0ab05af45ea9fbad8d +``` Block 49: - -- `prog_seed` - 0 -- `nonce` - `0000000006ff2c47` -- `header` - `63155f732f2bf556967f906155b510c917e48e99685ead76ea83f4eca03ab12b` -- _digest_ - `c789c1180f890ec555ff42042913465481e8e6bc512cb981e1c1108dc3f2227d` -- _result_ - `9e7248f20914913a73d80a70174c331b1d34f260535ac3631d770e656b5dd92` - -Block 50: - -- `prog_seed` - 1 -- `nonce` - `00000000076e482e` -- `header` - `9e7248f20914913a73d80a70174c331b1d34f260535ac3631d770e656b5dd922` -- _digest_ - `c7340542c2a06b3a7dc7222635f7cd402abf8b528ae971ddac6bbe2b0c7cb518` -- _result_ - `de37e1824c86d35d154cf65a88de6d9286aec4f7f10c3fc9f0fa1bcc2687188` - -Block 99: - -- `prog_seed` - 1 -- `nonce` - `000000003917afab` -- `header` - `de37e1824c86d35d154cf65a88de6d9286aec4f7f10c3fc9f0fa1bcc2687188d` -- _digest_ - `f5e60b2c5bfddd136167a30cbc3c8dbdbd15a512257dee7964e0bc6daa9f8ba7` -- _result_ - `ac7b55e801511b77e11d52e9599206101550144525b5679f2dab19386f23dcc` - -Block 29,950: - -- `prog_seed` - 599 -- `nonce` - `005d409dbc23a62a` -- `header` - `ac7b55e801511b77e11d52e9599206101550144525b5679f2dab19386f23dcce` -- _digest_ - `07393d15805eb08ee6fc6cb3ad4ad1010533bd0ff92d6006850246829f18fd6e` -- _result_ - `e43d7e0bdc8a4a3f6e291a5ed790b9fa1a0948a2b9e33c844888690847de19f` - -Block 29,999: - -- `prog_seed` - 599 -- `nonce` - `005db5fa4c2a3d03` -- `header` - `e43d7e0bdc8a4a3f6e291a5ed790b9fa1a0948a2b9e33c844888690847de19f5` -- _digest_ - `7551bddf977491da2f6cfc1679299544b23483e8f8ee0931c4c16a796558a0b8` -- _result_ - `d34519f72c97cae8892c277776259db3320820cb5279a299d0ef1e155e5c645` +``` +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x123456789abcdef0 +Hash init : 0xee304846ddd0a47b98179e96b60ec5ceeae2727834367e593de780e3e6d1892f +Mix seed : 0x7ba4d0dd464830ee +Mix hash : 0xa0e00c15ccff10aefeeef6ca28260807fdd7f2daaff7948b15857e3a65908f09 +Hash final : 0xa66465873e0674e95ac58efba116458342b3252abeb47874adaf139843ef79bb + +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x0000000000000001 +Hash init : 0xc69d81127f23e360ee0e7ca302fdc1f9e25a772a3728ec48be074a3d398e7677 +Mix seed : 0x60e3237f12819dc6 +Mix hash : 0x1704a993e5a8603615b964990253896681da83ddd10c0e6e8fee2f273fa2a961 +Hash final : 0x528dff2f543825030a8e0943013de7bc6a4b7c203c7398607811176b03ce70f4 +``` + +Block 14999: +``` +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x123456789abcdef0 +Hash init : 0xee304846ddd0a47b98179e96b60ec5ceeae2727834367e593de780e3e6d1892f +Mix seed : 0x7ba4d0dd464830ee +Mix hash : 0xfbbed3db6316658244eef0a897a901fdb40956de9439cf15a74582427443d3bc +Hash final : 0xcaaa67746a4a26c102580851c4f8542f455cd97c6f2749de216c4425504d53c4 + +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x0000000000000001 +Hash init : 0xc69d81127f23e360ee0e7ca302fdc1f9e25a772a3728ec48be074a3d398e7677 +Mix seed : 0x60e3237f12819dc6 +Mix hash : 0x0a66b4d37962836650099ad914d2688ffb5dc8688424256cf177c3e7b3f85e88 +Hash final : 0x4bd39ef9155cfd42f0ebb486ee7097d08f793147a9d157027db3d188770ac29d +``` Block 30,000: +``` +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x123456789abcdef0 +Hash init : 0xee304846ddd0a47b98179e96b60ec5ceeae2727834367e593de780e3e6d1892f +Mix seed : 0x7ba4d0dd464830ee +Mix hash : 0x493c13e9807440571511b561132834bbd558dddaa3b70c09515080a6a1aff6d0 +Hash final : 0x46b72b75f238bea3fcfd227e0027dc173dceaa1fb71744bd3d5e030ed2fed053 + +Header : 0xffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff +Nonce : 0x0000000000000001 +Hash init : 0xc69d81127f23e360ee0e7ca302fdc1f9e25a772a3728ec48be074a3d398e7677 +Mix seed : 0x60e3237f12819dc6 +Mix hash : 0x67ed6cc54b89262d1ac9c3f24ad0d2362cb703c1e23881713d0350ca3035e2ae +Hash final : 0xad8fa791bfb2f474a3487d27075bf339d73e0e69d62ecab14166add94c8d0f92 +``` -- `prog_seed` - 600 -- `nonce` - `005db8607994ff30` -- `header` - `d34519f72c97cae8892c277776259db3320820cb5279a299d0ef1e155e5c6454` -- _digest_ - `f1c2c7c32266af9635462e6ce1c98ebe4e7e3ecab7a38aaabfbf2e731e0fbff4` -- _result_ - `8b6ce5da0b06d18db7bd8492d9e5717f8b53e7e098d9fef7886d58a6e913ef6` - -Block 30,049: - -- `prog_seed` - 600 -- `nonce` - `005e2e215a8ca2e7` -- `header` - `8b6ce5da0b06d18db7bd8492d9e5717f8b53e7e098d9fef7886d58a6e913ef64` -- _digest_ - `57fe6a9fbf920b4e91deeb66cb0efa971e08229d1a160330e08da54af0689add` -- _result_ - `c2c46173481b9ced61123d2e293b42ede5a1b323210eb2a684df0874ffe0904` - -Block 30,050: - -- `prog_seed` - 601 -- `nonce` - `005e30899481055e` -- `header` - `c2c46173481b9ced61123d2e293b42ede5a1b323210eb2a684df0874ffe09047` -- _digest_ - `ba30c61cc5a2c74a5ecaf505965140a08f24a296d687e78720f0b48baf712f2d` -- _result_ - `ea42197eb2ba79c63cb5e655b8b1f612c5f08aae1a49ff236795a3516d87bc7` - -Block 30,099: - -- `prog_seed` - 601 -- `nonce` - `005ea6aef136f88b` -- `header` - `ea42197eb2ba79c63cb5e655b8b1f612c5f08aae1a49ff236795a3516d87bc71` -- _digest_ - `cfd5e46048cd133d40f261fe8704e51d3f497fc14203ac6a9ef6a0841780b1cd` -- _result_ - `49e15ba4bf501ce8fe8876101c808e24c69a859be15de554bf85dbc095491bd` - -Block 59,950: - -- `prog_seed` - 1,199 -- `nonce` - `02ebe0503bd7b1da` -- `header` - `49e15ba4bf501ce8fe8876101c808e24c69a859be15de554bf85dbc095491bd6` -- _digest_ - `21511fbaa31fb9f5fc4998a754e97b3083a866f4de86fa7500a633346f56d773` -- _result_ - `f5c50ba5c0d6210ddb16250ec3efda178de857b2b1703d8d5403bd0f848e19c` - -Block 59,999: - -- `prog_seed` - 1,199 -- `nonce` - `02edb6275bd221e3` -- `header` - `f5c50ba5c0d6210ddb16250ec3efda178de857b2b1703d8d5403bd0f848e19cf` -- _digest_ - `653eda37d337e39d311d22be9bbd3458d3abee4e643bee4a7280a6d08106ef98` -- _result_ - `341562d10d4afb706ec2c8d5537cb0c810de02b4ebb0a0eea5ae335af6fb2e8` - -Block 10,000,000: - -- `prog_seed` - 200,000 -- `nonce` - `005e30899481055e` -- `header` - `efda178de857b2b1703d8d5403bd0f848e19cff5c50ba5c0d6210ddb16250ec3` -- _digest_ - `b2403f56c426177856eaf0eedd707c86ae78a432b9169c3689a67058fcf2a848` -- _result_ - `206aee640c0fd21473d5cc3654d63c80442d9e2dfa676d2801d3ec1fbab38a6d` - -Block 100,000,000: - -- `prog_seed` - 2,000,000 -- `nonce` - `02abe0589481055e` -- `header` - `49e15ba4bf501ce8fe88765403bd0f848e19cff5c50ba5c0d6210ddb16250ec3` -- _digest_ - `ac452084d6f4e6eacf4282ad58dbd4ce7ef2653fb5e6b5c877f56928c907432a` -- _result_ - `b879f84923e71b812ef5a42ece0b5b9366c31cab218f40afe65f8a2cae448a6f` diff --git a/test/kernel.cu b/test/kernel.cu index 7aa147310..4fe7b83f6 100644 --- a/test/kernel.cu +++ b/test/kernel.cu @@ -1,133 +1,123 @@ -// Inner loop for prog_seed 600 -__device__ __forceinline__ void progPowLoop(const uint32_t loop, +// Inner loop for prog_seed 3,000 +DEV_INLINE void progPowLoop(const uint32_t loop, uint32_t mix[PROGPOW_REGS], - const dag_t *g_dag, + const dag_t* g_dag, const uint32_t c_dag[PROGPOW_CACHE_WORDS], - const bool hack_false) + const bool hack_false, + const uint32_t lane_id) { dag_t data_dag; uint32_t offset, data; - const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1); // global load - offset = __shfl_sync(0xFFFFFFFF, mix[0], loop%PROGPOW_LANES, PROGPOW_LANES); + offset = _SHFL(mix[0], loop & (PROGPOW_LANES - 1), PROGPOW_LANES); offset %= PROGPOW_DAG_ELEMENTS; - offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES; + offset = offset * PROGPOW_LANES + ((lane_id ^ loop) & (PROGPOW_LANES - 1)); data_dag = g_dag[offset]; // hack to prevent compiler from reordering LD and usage if (hack_false) __threadfence_block(); // cache load 0 - offset = mix[26] % PROGPOW_CACHE_WORDS; + offset = mix[12] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[0] = (mix[0] ^ data) * 33; + mix[26] = ROTR32(mix[26], 17) ^ data; // random math 0 - data = mix[10] ^ mix[16]; - mix[4] = ROTL32(mix[4], 27) ^ data; + data = mix[13] ^ mix[3]; + mix[9] = ROTL32(mix[9], 17) ^ data; // cache load 1 - offset = mix[30] % PROGPOW_CACHE_WORDS; + offset = mix[1] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[27] = ROTR32(mix[27], 7) ^ data; + mix[15] = ROTL32(mix[15], 15) ^ data; // random math 1 - data = mix[24] & mix[14]; - mix[26] = (mix[26] * 33) + data; + data = mix[24] ^ mix[10]; + mix[16] = (mix[16] * 33) + data; // cache load 2 - offset = mix[1] % PROGPOW_CACHE_WORDS; + offset = mix[29] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[13] = (mix[13] * 33) + data; + mix[25] = (mix[25] ^ data) * 33; // random math 2 - data = mix[17] & mix[16]; - mix[15] = ROTR32(mix[15], 12) ^ data; + data = ROTL32(mix[4], mix[12]); + mix[12] = ROTR32(mix[12], 13) ^ data; // cache load 3 - offset = mix[19] % PROGPOW_CACHE_WORDS; + offset = mix[6] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[17] = (mix[17] ^ data) * 33; + mix[7] = ROTL32(mix[7], 8) ^ data; // random math 3 - data = mul_hi(mix[31], mix[5]); - mix[7] = (mix[7] ^ data) * 33; + data = mix[8] * mix[24]; + mix[31] = (mix[31] ^ data) * 33; // cache load 4 - offset = mix[11] % PROGPOW_CACHE_WORDS; + offset = mix[11] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[14] = (mix[14] ^ data) * 33; + mix[27] = ROTL32(mix[27], 2) ^ data; // random math 4 - data = mix[23] * mix[19]; - mix[8] = (mix[8] * 33) + data; + data = popcount(mix[28]) + popcount(mix[17]); + mix[5] = (mix[5] * 33) + data; // cache load 5 - offset = mix[21] % PROGPOW_CACHE_WORDS; + offset = mix[18] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[9] = (mix[9] ^ data) * 33; + mix[11] = ROTR32(mix[11], 28) ^ data; // random math 5 - data = clz(mix[30]) + clz(mix[15]); - mix[12] = ROTR32(mix[12], 16) ^ data; + data = mix[31] ^ mix[12]; + mix[17] = (mix[17] ^ data) * 33; // cache load 6 - offset = mix[15] % PROGPOW_CACHE_WORDS; + offset = mix[8] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[3] = ROTR32(mix[3], 27) ^ data; + mix[29] = ROTR32(mix[29], 10) ^ data; // random math 6 - data = clz(mix[12]) + clz(mix[5]); + data = popcount(mix[4]) + popcount(mix[12]); mix[10] = (mix[10] * 33) + data; // cache load 7 - offset = mix[18] % PROGPOW_CACHE_WORDS; + offset = mix[14] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[1] = ROTR32(mix[1], 6) ^ data; + mix[6] = (mix[6] ^ data) * 33; // random math 7 - data = min(mix[4], mix[25]); - mix[11] = ROTR32(mix[11], 27) ^ data; + data = min(mix[10], mix[20]); + mix[24] = (mix[24] * 33) + data; // cache load 8 - offset = mix[3] % PROGPOW_CACHE_WORDS; + offset = mix[17] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[6] = (mix[6] ^ data) * 33; + mix[14] = (mix[14] ^ data) * 33; // random math 8 - data = mul_hi(mix[18], mix[16]); - mix[16] = (mix[16] ^ data) * 33; + data = mix[0] * mix[10]; + mix[19] = ROTR32(mix[19], 23) ^ data; // cache load 9 - offset = mix[17] % PROGPOW_CACHE_WORDS; + offset = mix[9] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[28] = ROTL32(mix[28], 17) ^ data; + mix[23] = (mix[23] * 33) + data; // random math 9 - data = ROTL32(mix[15], mix[23]); - mix[31] = (mix[31] * 33) + data; + data = min(mix[22], mix[28]); + mix[1] = ROTR32(mix[1], 4) ^ data; // cache load 10 - offset = mix[31] % PROGPOW_CACHE_WORDS; + offset = mix[0] & (PROGPOW_CACHE_WORDS - 1); data = c_dag[offset]; - mix[2] = (mix[2] * 33) + data; + mix[18] = (mix[18] ^ data) * 33; // random math 10 - data = mix[11] | mix[17]; - mix[19] = ROTL32(mix[19], 28) ^ data; - // cache load 11 - offset = mix[16] % PROGPOW_CACHE_WORDS; - data = c_dag[offset]; - mix[30] = ROTR32(mix[30], 18) ^ data; + data = ROTL32(mix[22], mix[9]); + mix[21] = ROTR32(mix[21], 5) ^ data; // random math 11 - data = mix[22] * mix[7]; - mix[22] = ROTR32(mix[22], 30) ^ data; + data = min(mix[26], mix[4]); + mix[22] = (mix[22] * 33) + data; // random math 12 - data = mix[27] & mix[16]; - mix[29] = ROTR32(mix[29], 25) ^ data; + data = min(mix[19], mix[30]); + mix[8] = ROTL32(mix[8], 26) ^ data; // random math 13 - data = ROTL32(mix[11], mix[0]); - mix[5] = (mix[5] ^ data) * 33; + data = mix[12] ^ mix[24]; + mix[3] = ROTL32(mix[3], 30) ^ data; // random math 14 - data = ROTR32(mix[15], mix[25]); - mix[24] = ROTL32(mix[24], 13) ^ data; + data = min(mix[8], mix[13]); + mix[28] = ROTL32(mix[28], 31) ^ data; // random math 15 - data = mix[14] & mix[26]; - mix[18] = (mix[18] * 33) + data; + data = ROTL32(mix[12], mix[9]); + mix[30] = ROTL32(mix[30], 31) ^ data; // random math 16 - data = mix[28] * mix[16]; - mix[25] = (mix[25] ^ data) * 33; + data = ROTL32(mix[28], mix[27]); + mix[2] = (mix[2] * 33) + data; // random math 17 - data = mix[11] * mix[0]; - mix[23] = (mix[23] ^ data) * 33; - // random math 18 - data = mix[2] + mix[24]; - mix[21] = ROTR32(mix[21], 20) ^ data; - // random math 19 - data = mix[25] + mix[4]; - mix[20] = ROTL32(mix[20], 22) ^ data; + data = ROTL32(mix[30], mix[28]); + mix[20] = ROTL32(mix[20], 12) ^ data; // consume global load data // hack to prevent compiler from reordering LD and usage if (hack_false) __threadfence_block(); - mix[0] = (mix[0] ^ data_dag.s[0]) * 33; - mix[0] = ROTR32(mix[0], 21) ^ data_dag.s[1]; - mix[4] = (mix[4] * 33) + data_dag.s[2]; - mix[27] = (mix[27] ^ data_dag.s[3]) * 33; -} \ No newline at end of file + mix[0] = (mix[0] * 33) + data_dag.words[0]; + mix[4] = ROTL32(mix[4], 13) ^ data_dag.words[1]; + mix[13] = (mix[13] ^ data_dag.words[2]) * 33; + mix[0] = ROTR32(mix[0], 12) ^ data_dag.words[3]; +} diff --git a/test/result.log b/test/result.log deleted file mode 100644 index 9c9695cd6..000000000 --- a/test/result.log +++ /dev/null @@ -1,138 +0,0 @@ ->ethminer.exe -U -M 30000 - m 00:37:26|main | ethminer version 0.15.0.dev0 - m 00:37:26|main | Build: windows / release +git. 2c02a51 - cu 00:37:26|main | Using grid size 1024 , block size 512 -Benchmarking on platform: CUDA -Preparing DAG for block #30000 - i Warming up...00:37:26|cuda-0 | No work. - - i 00:37:26|cuda-0 | Initialising miner 0 - cu 00:37:26|cuda-0 | Using device: GeForce GTX 1060 6GB (Compute 6.1) - cu 00:37:26|cuda-0 | Set Device to current - cu 00:37:26|cuda-0 | Resetting device - cu 00:37:27|cuda-0 | Allocating light with size: 16907456 - cu 00:37:27|cuda-0 | Generating mining buffers - cu 00:37:27|cuda-0 | Generating DAG for GPU # 0 with dagBytes: 1082130304 gridSize: 1024 - cu 00:37:30|cuda-0 | Finished DAG - cu 00:37:31|cuda-0 | Compile log: - cu 00:37:31|cuda-0 | JIT info: - ptxas info : 202 bytes gmem, 96 bytes cmem[3] -ptxas info : Compiling entry function '_Z14progpow_searchy8hash32_tyPK5dag_tPV14search_resultsb' for 'sm_61' -ptxas info : Function properties for _Z14progpow_searchy8hash32_tyPK5dag_tPV14search_resultsb -ptxas . 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 158 registers, 16384 bytes smem, 385 bytes cmem[0], 4 bytes cmem[2] -ptxas info : Function properties for _Z11keccak_f8008hash32_tyS_ -ptxas . 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads - cu 00:37:31|cuda-0 | JIT err: - - cu 00:37:31|cuda-0 | Mangled name: _Z14progpow_searchy8hash32_tyPK5dag_tPV14search_resultsb - cu 00:37:31|cuda-0 | done compiling -header ffeeddccbbaa9988776655443322110000112233445566778899aabbccddeeff -nonce 123456789abcdef0 -seed ee304846ddd0a47b -cdag[0]=b3e35467 cdag[1]=ae7402e3 cdag[2]=8522a782 cdag[3]=a2d8353b -cdag[4]=ff4723bd cdag[5]=bfbc05ee cdag[6]=de6944de cdag[7]=f0d2b5b8 -cdag[8]=c74cbad3 cdag[9]=b100f797 cdag[10]=05bc60be cdag[11]=4f40840b -cdag[12]=35e47268 cdag[13]=9cd6f993 cdag[14]=6a0e4659 cdag[15]=b838e46e -cdag[4080]=bde0c650 cdag[4081]=57cba482 cdag[4082]=54877c9d cdag[4083]=f9fdc423 -cdag[4084]=fb65141b cdag[4085]=55074ca4 cdag[4086]=c7dd116e cdag[4087]=bc1737d1 -cdag[4088]=126e8847 cdag[4089]=b16983b2 cdag[4090]=f80c058e cdag[4091]=e0ad53b5 -cdag[4092]=d5f3e840 cdag[4093]=ff1bdd89 cdag[4094]=35660a19 cdag[4095]=73244193 -fill_mix lane 0 mix[0]=10c02f0d ... mix[31]=6d175b7e -fill_mix lane 1 mix[0]=f301da0a ... mix[31]=be60fc28 -fill_mix lane 2 mix[0]=81f99723 ... mix[31]=5cbc32d1 -fill_mix lane 3 mix[0]=35e2204b ... mix[31]=4951b6e6 -fill_mix lane 4 mix[0]=1e276b09 ... mix[31]=70db3fa7 -fill_mix lane 5 mix[0]=eb343d86 ... mix[31]=0ae9ecfa -fill_mix lane 6 mix[0]=ff34cf47 ... mix[31]=e2cd3093 -fill_mix lane 7 mix[0]=c7056ce8 ... mix[31]=4c362d17 -fill_mix lane 8 mix[0]=bd42a7b6 ... mix[31]=7da69107 -fill_mix lane 9 mix[0]=95d03571 ... mix[31]=7d22f89e -fill_mix lane 10 mix[0]=af0e74dc ... mix[31]=12f2d96c -fill_mix lane 11 mix[0]=1287d683 ... mix[31]=1931a478 -fill_mix lane 12 mix[0]=96f960c2 ... mix[31]=6f5b9f23 -fill_mix lane 13 mix[0]=4e46d05d ... mix[31]=3837dda3 -fill_mix lane 14 mix[0]=1a9b1d40 ... mix[31]=08b4fc39 -fill_mix lane 15 mix[0]=3344ce0f ... mix[31]=20201012 -loop 0 dag entry 2043727 of 4227071 total -loop 1 dag entry 1878577 of 4227071 total -loop 2 dag entry 1972818 of 4227071 total -loop 3 dag entry 4192557 of 4227071 total -loop 4 dag entry 2908963 of 4227071 total -loop 5 dag entry 650106 of 4227071 total -loop 6 dag entry 3360110 of 4227071 total -loop 7 dag entry 2666972 of 4227071 total -loop 8 dag entry 3262571 of 4227071 total -loop 9 dag entry 1876031 of 4227071 total -loop 10 dag entry 1099946 of 4227071 total -loop 11 dag entry 1058639 of 4227071 total -loop 12 dag entry 4091582 of 4227071 total -loop 13 dag entry 2295331 of 4227071 total -loop 14 dag entry 2587683 of 4227071 total -loop 15 dag entry 950942 of 4227071 total -loop 16 dag entry 2427766 of 4227071 total -loop 17 dag entry 677253 of 4227071 total -loop 18 dag entry 3564299 of 4227071 total -loop 19 dag entry 2373221 of 4227071 total -loop 20 dag entry 2065878 of 4227071 total -loop 21 dag entry 2684534 of 4227071 total -loop 22 dag entry 3563556 of 4227071 total -loop 23 dag entry 909053 of 4227071 total -loop 24 dag entry 3867986 of 4227071 total -loop 25 dag entry 959685 of 4227071 total -loop 26 dag entry 2837635 of 4227071 total -loop 27 dag entry 3312470 of 4227071 total -loop 28 dag entry 3048893 of 4227071 total -loop 29 dag entry 3601694 of 4227071 total -loop 30 dag entry 3536836 of 4227071 total -loop 31 dag entry 533409 of 4227071 total -loop 32 dag entry 3736438 of 4227071 total -loop 33 dag entry 864961 of 4227071 total -loop 34 dag entry 188397 of 4227071 total -loop 35 dag entry 3814381 of 4227071 total -loop 36 dag entry 4108296 of 4227071 total -loop 37 dag entry 3950694 of 4227071 total -loop 38 dag entry 2069968 of 4227071 total -loop 39 dag entry 2745630 of 4227071 total -loop 40 dag entry 1008990 of 4227071 total -loop 41 dag entry 2675149 of 4227071 total -loop 42 dag entry 3352224 of 4227071 total -loop 43 dag entry 472995 of 4227071 total -loop 44 dag entry 3348733 of 4227071 total -loop 45 dag entry 2693084 of 4227071 total -loop 46 dag entry 3406906 of 4227071 total -loop 47 dag entry 2629222 of 4227071 total -loop 48 dag entry 911095 of 4227071 total -loop 49 dag entry 1358231 of 4227071 total -loop 50 dag entry 3574453 of 4227071 total -loop 51 dag entry 4109829 of 4227071 total -loop 52 dag entry 3503254 of 4227071 total -loop 53 dag entry 3897141 of 4227071 total -loop 54 dag entry 4030761 of 4227071 total -loop 55 dag entry 2855865 of 4227071 total -loop 56 dag entry 452603 of 4227071 total -loop 57 dag entry 1258735 of 4227071 total -loop 58 dag entry 2380407 of 4227071 total -loop 59 dag entry 3320172 of 4227071 total -loop 60 dag entry 3180940 of 4227071 total -loop 61 dag entry 1407113 of 4227071 total -loop 62 dag entry 1057525 of 4227071 total -loop 63 dag entry 574671 of 4227071 total -digest lane 0: 5883883e -digest lane 1: 2fb0fd2e -digest lane 2: eadb7563 -digest lane 3: 4a171075 -digest lane 4: ac2758f5 -digest lane 5: aa5b06cf -digest lane 6: 52156e93 -digest lane 7: 4f7a7fff -digest lane 8: fe91e36a -digest lane 9: 9964c8b6 -digest lane 10: 6a3d93e2 -digest lane 11: 3c6d641f -digest lane 12: e90da618 -digest lane 13: 80cd8ab9 -digest lane 14: ce72386f -digest lane 15: 95517d28 -digest: 11f19805c58ab46610ff9c719dcf0a5f18fa2f1605798eef770c47219274767d -result (top 64 bits): 5b7ccd472dbefdd9 \ No newline at end of file From 122aee1a265144bd4e54dbd8a64e3a42dc522a50 Mon Sep 17 00:00:00 2001 From: AndreaLanfranchi Date: Thu, 30 Apr 2020 18:04:12 +0200 Subject: [PATCH 19/19] Update documentation 0.9.4 --- README.md | 10 +++++----- test-vectors.md | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 76d10ad8d..886a2e0f2 100644 --- a/README.md +++ b/README.md @@ -550,11 +550,11 @@ Additional test vectors can be found [in the test vectors file](test-vectors.md# ## Change History - 0.9.4 (current) - Patch the [bypass memory hardness](https://github.com/ifdefelse/ProgPOW/issues/51) vulnerability. -- [0.9.3](https://medium.com/@ifdefelse/progpow-progress-da5bb31a651b) - Reduce parameters PERIOD, CNT_MATH, and CNT_CACHE. -- [0.9.2](https://github.com/ifdefelse/ProgPOW/blob/0e39b62deb0c9ab14900fc404fcb19cac70240e1/README.md) - Unique sources for math() and prevent rotation by 0 in merge(). Suggested by [SChernykh](https://github.com/ifdefelse/ProgPOW/issues/19) -- [0.9.1](https://github.com/ifdefelse/ProgPOW/blob/60bba1c3fdad6a54539fc3e9f05727547de9c58c/README.md) - Shuffle what part of the DAG entry each lane accesses. Suggested by [mbevand](https://github.com/ifdefelse/ProgPOW/pull/13) -- [0.9.0](https://github.com/ifdefelse/ProgPOW/blob/a3f62349a1513f0393524683f9671cfe17cca895/README.md) - Unique cache address sources, re-tune parameters -- [0.8.0](https://github.com/ifdefelse/ProgPOW/blob/620b4c7aafe60391f863372814d7517e94386379/README.md) - Original spec +- [0.9.3](https://github.com/ifdefelse/ProgPOW/tree/spec-0.9.3) - Reduce parameters PERIOD, CNT_MATH, and CNT_CACHE. See [this medium post](https://medium.com/@ifdefelse/progpow-progress-da5bb31a651b) for details. +- [0.9.2](https://github.com/ifdefelse/ProgPOW/tree/spec-0.9.2) - Unique sources for math() and prevent rotation by 0 in merge(). Suggested by [SChernykh](https://github.com/ifdefelse/ProgPOW/issues/19) +- [0.9.1](https://github.com/ifdefelse/ProgPOW/tree/spec-0.9.1) - Shuffle what part of the DAG entry each lane accesses. Suggested by [mbevand](https://github.com/ifdefelse/ProgPOW/pull/13) +- [0.9.0](https://github.com/ifdefelse/ProgPOW/tree/spec-0.9.0) - Unique cache address sources, re-tune parameters +- [0.8.0](https://github.com/ifdefelse/ProgPOW/tree/spec-0.8.0) - Original spec ## License diff --git a/test-vectors.md b/test-vectors.md index 2bc303f62..8b9dd92df 100644 --- a/test-vectors.md +++ b/test-vectors.md @@ -177,4 +177,3 @@ Mix seed : 0x60e3237f12819dc6 Mix hash : 0x67ed6cc54b89262d1ac9c3f24ad0d2362cb703c1e23881713d0350ca3035e2ae Hash final : 0xad8fa791bfb2f474a3487d27075bf339d73e0e69d62ecab14166add94c8d0f92 ``` -