diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c2b7ce6..22770355 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,7 +14,7 @@ restore-workspace: &restore-workspace restore-cache: &restore-cache restore_cache: keys: - - cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} + - cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} - repo-source-{{ .Branch }}-{{ .Revision }} commands: @@ -35,13 +35,22 @@ commands: no_output_timeout: 5m - run: name: Test (pairing, GPU) (<< parameters.target >>) - command: TARGET=<< parameters.target >> cargo test --release --features gpu -- --test-threads=1 + command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features pairing,gpu -- --test-threads=1 no_output_timeout: 30m - run: name: Test (blst, GPU) (<< parameters.target >>) command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features blst,gpu -- --test-threads=1 no_output_timeout: 30m + - run: + name: Test (pairing, opencl) (<< parameters.target >>) + command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features pairing,opencl -- --test-threads=1 + no_output_timeout: 30m + + - run: + name: Test (blst, opencl) (<< parameters.target >>) + command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features blst,opencl -- --test-threads=1 + no_output_timeout: 30m jobs: cargo_fetch: @@ -63,12 +72,12 @@ jobs: command: cargo generate-lockfile - restore_cache: keys: - - cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} + - cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} - run: cargo update - run: cargo fetch - run: rustup install $(cat rust-toolchain) - run: rustup default $(cat rust-toolchain) - - run: rustup 
install nightly + - run: rustup install nightly-2020-11-18 - run: rustup component add rustfmt-preview - run: rustup component add clippy-preview - run: rustc --version @@ -78,7 +87,7 @@ jobs: paths: - gpuci - save_cache: - key: cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} + key: cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} paths: - "~/.cargo" - "~/.rustup" @@ -175,10 +184,10 @@ jobs: - run: sudo apt install -y ocl-icd-opencl-dev - run: name: Run cargo release build (pairing, gpu) - command: cargo +nightly build -Zpackage-features --release -p gbench --no-default-features --features pairing,gpu + command: cargo +nightly-2020-11-18 build -Zpackage-features --release -p gbench --no-default-features --features pairing,gpu - run: name: Run cargo release build (blst, gpu) - command: cargo +nightly build -Zpackage-features --release -p gbench --no-default-features --features blst,gpu + command: cargo +nightly-2020-11-18 build -Zpackage-features --release -p gbench --no-default-features --features blst,gpu benches: executor: default diff --git a/CHANGELOG.md b/CHANGELOG.md index fe9f08de..ca6eb597 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://book.async.rs/overview/stability-guarantees.html). ## Unreleased +- Pure OpenCL implementation of batch hashing. 
(https://github.com/filecoin-project/neptune/pull/78) ## 2.4.0 - 2020-11-17 diff --git a/Cargo.toml b/Cargo.toml index dc78f7fd..d2b118c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,12 +11,15 @@ repository = "https://github.com/filecoin-project/neptune" lazy_static = "1.4.0" bellperson = { version = "0.12", default-features = false } blake2s_simd = "0.5" +blstrs = { version = "0.2.0", optional = true } byteorder = "1" ff = { version = "0.2.1", package = "fff" } generic-array = "0.14.4" log = "0.4.8" rust-gpu-tools = { version = "0.3.0", optional = true } triton = { version = "2.1.0", package = "neptune-triton", default-features = false, features = ["opencl"], optional = true } +itertools = { version = "0.8.0" } +ff-cl-gen = "0.2.0" [dev-dependencies] criterion = "0.3" @@ -45,6 +48,7 @@ codegen-units = 1 [features] default = ["pairing"] gpu = ["triton", "rust-gpu-tools"] +opencl = ["rust-gpu-tools"] pairing = ["bellperson/pairing"] blst = ["bellperson/blst"] diff --git a/README.md b/README.md index 10cd5a1b..898768af 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,14 @@ proofs (in SNARKs). Neptune also supports batch hashing and tree building, which can be performed on a GPU. The underlying GPU implementation, [neptune-triton](https://github.com/filecoin-project/neptune-triton) is implemented in the [Futhark -Programming Language](https://futhark-lang.org/). +Programming Language](https://futhark-lang.org/). To use `neptune-triton` GPU batch hashing, compile `neptune` with the +`gpu` feature. + +Neptune now implements GPU batch hashing in pure OpenCL. The initial implementation is a bit less than 2x faster than +the Futhark implementation, so once stabilized this will likely be the preferred option. The pure OpenCL batch hashing +is provided by the internal `proteus` module. To use `proteus`, compile `neptune` with the `opencl` feature. + +The `gpu` and `opencl` features are mutually exclusive. 
At the time of the 1.0.0 release, Neptune on RTX 2080Ti GPU can build 8-ary Merkle trees for 4GiB of input in 16 seconds. @@ -35,7 +42,7 @@ The following are likely areas of future work: - [x] Support for multiple GPUs. - [x] Support domain separation tag. -- [ ] Improve throughput (?) by using OpenCL directly. +- [x] Improve throughput (?) by using OpenCL directly. ## History diff --git a/gbench/Cargo.toml b/gbench/Cargo.toml index e0297ddf..184d71b7 100644 --- a/gbench/Cargo.toml +++ b/gbench/Cargo.toml @@ -15,12 +15,13 @@ env_logger = "0.7.1" ff = { version = "0.2.1", package = "fff" } generic-array = "0.14.4" log = "0.4.8" -neptune = { path = "../", default-features = false, features=["gpu"] } +neptune = { path = "../", default-features = false } rust-gpu-tools = { version = "0.3.0", optional = true } structopt = { version = "0.3", default-features = false } [features] default = ["pairing", "gpu"] gpu = ["neptune/gpu", "rust-gpu-tools"] +opencl = ["neptune/opencl", "rust-gpu-tools"] pairing = ["neptune/pairing", "bellperson/pairing"] blst = ["neptune/blst", "bellperson/blst"] diff --git a/gbench/src/main.rs b/gbench/src/main.rs index 26c3b70b..2418f3d6 100644 --- a/gbench/src/main.rs +++ b/gbench/src/main.rs @@ -107,7 +107,7 @@ struct Opts { } fn main() -> Result<(), Error> { - #[cfg(all(feature = "gpu", target_os = "macos"))] + #[cfg(all(any(feature = "gpu", feature = "opencl"), target_os = "macos"))] unimplemented!("Running on macos is not recommended and may have bad consequences -- experiment at your own risk."); env_logger::init(); @@ -127,6 +127,13 @@ fn main() -> Result<(), Error> { // Comma separated list of GPU bus-ids let gpus = std::env::var("NEPTUNE_GBENCH_GPUS"); + + #[cfg(feature = "gpu")] + let default_type = BatcherType::GPU; + + #[cfg(feature = "opencl")] + let default_type = BatcherType::OpenCL; + let batcher_types = gpus .map(|v| { v.split(",") @@ -134,7 +141,8 @@ fn main() -> Result<(), Error> { .map(|bus_id| 
BatcherType::CustomGPU(GPUSelector::BusId(bus_id))) .collect::>() }) - .unwrap_or(vec![BatcherType::GPU]); + .unwrap_or(vec![default_type]); + let mut threads = Vec::new(); for batcher_type in batcher_types { threads.push(thread::spawn(move || { diff --git a/src/batch_hasher.rs b/src/batch_hasher.rs index e42fd684..32f86273 100644 --- a/src/batch_hasher.rs +++ b/src/batch_hasher.rs @@ -1,11 +1,14 @@ +use rust_gpu_tools::opencl; use std::fmt::{self, Debug}; use std::marker::PhantomData; use std::sync::{Arc, Mutex}; -#[cfg(feature = "gpu")] -use crate::cl; use crate::error::Error; use crate::poseidon::SimplePoseidonBatchHasher; +#[cfg(feature = "opencl")] +use crate::proteus::gpu::{get_device, CLBatchHasher}; +#[cfg(feature = "gpu")] +use crate::triton::cl; use crate::{Arity, BatchHasher, Strength, DEFAULT_STRENGTH}; use bellperson::bls::Fr; use generic_array::GenericArray; @@ -16,34 +19,50 @@ use triton::FutharkContext; #[derive(Clone)] pub enum BatcherType { - #[cfg(feature = "gpu")] + #[cfg(any(feature = "gpu", feature = "opencl"))] CustomGPU(GPUSelector), #[cfg(feature = "gpu")] FromFutharkContext(Arc>), + #[cfg(feature = "opencl")] + FromDevice(opencl::Device), + #[cfg(feature = "gpu")] GPU, CPU, + #[cfg(feature = "opencl")] + OpenCL, } impl Debug for BatcherType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_fmt(format_args!("BatcherType::"))?; match self { + #[cfg(feature = "gpu")] BatcherType::FromFutharkContext(_) => f.write_fmt(format_args!("FromFutharkContext")), + #[cfg(feature = "opencl")] + BatcherType::FromDevice(_) => f.write_fmt(format_args!("FromDevice")), + #[cfg(any(feature = "gpu", feature = "opencl"))] BatcherType::CustomGPU(x) => f.write_fmt(format_args!("CustomGPU({:?})", x)), BatcherType::CPU => f.write_fmt(format_args!("CPU")), + #[cfg(feature = "gpu")] BatcherType::GPU => f.write_fmt(format_args!("GPU")), + #[cfg(feature = "opencl")] + BatcherType::OpenCL => f.write_fmt(format_args!("OpenCL")), } } } -use 
crate::gpu::GPUBatchHasher; +#[cfg(feature = "gpu")] +use crate::triton::gpu::GPUBatchHasher; pub enum Batcher where A: Arity, { + #[cfg(feature = "gpu")] GPU(GPUBatchHasher), CPU(SimplePoseidonBatchHasher), + #[cfg(feature = "opencl")] + OpenCL(CLBatchHasher), } impl Batcher @@ -52,8 +71,11 @@ where { pub(crate) fn t(&self) -> BatcherType { match self { + #[cfg(feature = "gpu")] Batcher::GPU(_) => BatcherType::GPU, Batcher::CPU(_) => BatcherType::CPU, + #[cfg(feature = "opencl")] + Batcher::OpenCL(_) => BatcherType::OpenCL, } } @@ -67,6 +89,9 @@ where max_batch_size: usize, ) -> Result { match t { + BatcherType::CPU => Ok(Batcher::CPU( + SimplePoseidonBatchHasher::::new_with_strength(strength, max_batch_size)?, + )), #[cfg(feature = "gpu")] BatcherType::GPU => Ok(Batcher::GPU(GPUBatchHasher::::new_with_strength( cl::default_futhark_context()?, @@ -81,9 +106,6 @@ where max_batch_size, )?)) } - BatcherType::CPU => Ok(Batcher::CPU( - SimplePoseidonBatchHasher::::new_with_strength(strength, max_batch_size)?, - )), #[cfg(feature = "gpu")] BatcherType::FromFutharkContext(futhark_context) => { Ok(Batcher::GPU(GPUBatchHasher::::new_with_strength( @@ -92,6 +114,24 @@ where max_batch_size, )?)) } + #[cfg(feature = "opencl")] + BatcherType::OpenCL => Ok(Batcher::OpenCL(CLBatchHasher::::new_with_strength( + get_device(&GPUSelector::Index(0))?, + strength, + max_batch_size, + )?)), + #[cfg(feature = "opencl")] + BatcherType::CustomGPU(selector) => { + Ok(Batcher::OpenCL(CLBatchHasher::::new_with_strength( + get_device(selector)?, + strength, + max_batch_size, + )?)) + } + #[cfg(feature = "opencl")] + BatcherType::FromDevice(device) => Ok(Batcher::OpenCL( + CLBatchHasher::::new_with_strength(&device, strength, max_batch_size)?, + )), } } @@ -102,6 +142,14 @@ where _ => None, } } + + #[cfg(feature = "opencl")] + pub(crate) fn device(&self) -> Option { + match self { + Batcher::OpenCL(b) => Some(b.device()), + _ => None, + } + } } impl BatchHasher for Batcher @@ -110,42 +158,21 
@@ where { fn hash(&mut self, preimages: &[GenericArray]) -> Result, Error> { match self { - Batcher::GPU(batcher) => batcher.hash(preimages), Batcher::CPU(batcher) => batcher.hash(preimages), + #[cfg(feature = "gpu")] + Batcher::GPU(batcher) => batcher.hash(preimages), + #[cfg(feature = "opencl")] + Batcher::OpenCL(batcher) => batcher.hash(preimages), } } fn max_batch_size(&self) -> usize { match self { - Batcher::GPU(batcher) => batcher.max_batch_size(), Batcher::CPU(batcher) => batcher.max_batch_size(), + #[cfg(feature = "gpu")] + Batcher::GPU(batcher) => batcher.max_batch_size(), + #[cfg(feature = "opencl")] + Batcher::OpenCL(batcher) => batcher.max_batch_size(), } } } - -// /// NoGPUBatchHasher is a dummy required so we can build with the gpu flag even on platforms on which we cannot currently -// /// run with GPU. -pub struct NoGPUBatchHasher(PhantomData); - -impl BatchHasher for NoGPUBatchHasher -where - A: Arity, -{ - fn hash(&mut self, _preimages: &[GenericArray]) -> Result, Error> { - unimplemented!(); - } - - fn max_batch_size(&self) -> usize { - unimplemented!(); - } -} - -#[cfg(feature = "gpu")] -impl NoGPUBatchHasher -where - A: Arity, -{ - fn futhark_context(&self) -> Arc> { - unimplemented!() - } -} diff --git a/src/column_tree_builder.rs b/src/column_tree_builder.rs index 0097a3c0..fc8bbde7 100644 --- a/src/column_tree_builder.rs +++ b/src/column_tree_builder.rs @@ -120,16 +120,27 @@ where let tree_builder = match { match &column_batcher { + #[cfg(feature = "gpu")] Some(b) => b.futhark_context(), + #[cfg(feature = "opencl")] + Some(b) => b.device(), None => None, } } { + #[cfg(feature = "gpu")] Some(ctx) => TreeBuilder::::new( Some(BatcherType::FromFutharkContext(ctx)), leaf_count, max_tree_batch_size, 0, )?, + #[cfg(feature = "opencl")] + Some(device) => TreeBuilder::::new( + Some(BatcherType::FromDevice(device)), + leaf_count, + max_tree_batch_size, + 0, + )?, None => TreeBuilder::::new(t, leaf_count, max_tree_batch_size, 0)?, }; @@ -162,7 +173,7 
@@ where } } -#[cfg(all(feature = "gpu", not(target_os = "macos")))] +#[cfg(all(any(feature = "gpu", feature = "opencl"), not(target_os = "macos")))] #[cfg(test)] mod tests { use super::*; @@ -181,6 +192,9 @@ mod tests { #[cfg(feature = "gpu")] test_column_tree_builder_aux(Some(BatcherType::GPU), 512, 32, 512, 512); + + #[cfg(feature = "opencl")] + test_column_tree_builder_aux(Some(BatcherType::OpenCL), 512, 32, 512, 512); } fn test_column_tree_builder_aux( diff --git a/src/error.rs b/src/error.rs index 6b54f343..f90233d2 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,7 +1,47 @@ #[cfg(feature = "gpu")] -use crate::cl; +use crate::triton::cl; use std::{error, fmt}; +#[derive(Debug, Clone)] +#[cfg(any(feature = "gpu", feature = "opencl"))] +pub enum ClError { + DeviceNotFound, + PlatformNotFound, + BusIdNotAvailable, + NvidiaBusIdNotAvailable, + AmdTopologyNotAvailable, + PlatformNameNotAvailable, + CannotCreateContext, + CannotCreateQueue, + GetDeviceError, +} + +#[cfg(any(feature = "gpu", feature = "opencl"))] +pub type ClResult = std::result::Result; + +#[cfg(any(feature = "gpu", feature = "opencl"))] +impl fmt::Display for ClError { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + match self { + ClError::DeviceNotFound => write!(f, "Device not found."), + ClError::PlatformNotFound => write!(f, "Platform not found."), + ClError::BusIdNotAvailable => write!(f, "Cannot extract bus-id for the given device."), + ClError::NvidiaBusIdNotAvailable => { + write!(f, "Cannot extract bus-id for the given Nvidia device.") + } + ClError::AmdTopologyNotAvailable => { + write!(f, "Cannot extract bus-id for the given AMD device.") + } + ClError::PlatformNameNotAvailable => { + write!(f, "Cannot extract platform name for the given platform.") + } + ClError::CannotCreateContext => write!(f, "Cannot create cl_context."), + ClError::CannotCreateQueue => write!(f, "Cannot create cl_command_queue."), + ClError::GetDeviceError => write!(f, "Cannot get Device"), 
+ } + } +} + #[derive(Debug, Clone)] /// Possible error states for the hashing. pub enum Error { @@ -11,8 +51,8 @@ pub enum Error { IndexOutOfBounds, /// The provided leaf was not found in the tree GPUError(String), - #[cfg(feature = "gpu")] - ClError(cl::ClError), + #[cfg(any(feature = "gpu", feature = "opencl"))] + ClError(ClError), #[cfg(feature = "gpu")] TritonError(String), DecodingError, @@ -20,8 +60,8 @@ pub enum Error { } #[cfg(feature = "gpu")] -impl From for Error { - fn from(e: cl::ClError) -> Self { +impl From for Error { + fn from(e: ClError) -> Self { Self::ClError(e) } } @@ -44,7 +84,7 @@ impl fmt::Display for Error { ), Error::IndexOutOfBounds => write!(f, "The referenced index is outs of bounds."), Error::GPUError(s) => write!(f, "GPU Error: {}", s), - #[cfg(feature = "gpu")] + #[cfg(any(feature = "gpu", feature = "opencl"))] Error::ClError(e) => write!(f, "OpenCL Error: {}", e), #[cfg(feature = "gpu")] Error::TritonError(e) => write!(f, "Neptune-triton Error: {}", e), diff --git a/src/lib.rs b/src/lib.rs index 2b1d15fd..51d503d2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,9 @@ pub use error::Error; use ff::{Field, PrimeField, ScalarEngine}; use generic_array::GenericArray; +#[cfg(all(feature = "gpu", feature = "opencl"))] +compile_error!("gpu and opencl features are mutually exclusive"); + /// Poseidon circuit pub mod circuit; pub mod error; @@ -27,23 +30,23 @@ mod round_constants; pub mod hash_type; /// Tree Builder -#[cfg(feature = "gpu")] +#[cfg(any(feature = "gpu", feature = "opencl"))] pub mod tree_builder; /// Column Tree Builder -#[cfg(feature = "gpu")] +#[cfg(any(feature = "gpu", feature = "opencl"))] pub mod column_tree_builder; #[cfg(feature = "gpu")] -mod gpu; - -#[cfg(feature = "gpu")] -pub mod cl; +pub mod triton; /// Batch Hasher -#[cfg(feature = "gpu")] +#[cfg(any(feature = "gpu", feature = "opencl"))] pub mod batch_hasher; +#[cfg(feature = "opencl")] +pub mod proteus; + pub(crate) const TEST_SEED: [u8; 16] = [ 0x59, 0x62, 
0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, 0xe5, ]; diff --git a/src/poseidon.rs b/src/poseidon.rs index 79baec6c..0a039162 100644 --- a/src/poseidon.rs +++ b/src/poseidon.rs @@ -452,7 +452,6 @@ where ) { element.add_assign(round_constant); } - self.constants_offset += self.elements.len(); } diff --git a/src/proteus/cl/poseidon.cl b/src/proteus/cl/poseidon.cl new file mode 100644 index 00000000..58356c02 --- /dev/null +++ b/src/proteus/cl/poseidon.cl @@ -0,0 +1,178 @@ +typedef struct state {{ + {field} elements[{width}]; + int current_round; + int rk_offset; +}} state; + +void debug_f({field} f) {{ + {field}_print({field}_unmont(f)); + printf("\n"); +}} +void debug(state s) {{ + if (get_global_id(0) == 0) {{ + printf("state: "); + for (int i = 0; i < {width}; ++i) {{ + {field} x = s.elements[i]; + debug_f(x); + }} + printf("\n"); + }} +}} + +void debug_vec(__constant {field} v[], int size) {{ + if (get_global_id(0) == 0) {{ + for (int i = 0; i < size; ++i) {{ + {field} x = v[i]; + debug_f(x); + }} + printf("\n"); + }} +}} + +{field} quintic_s_box({field} l, {field} pre_add, {field} post_add) {{ + {field} tmp = {field}_add(l, pre_add); + tmp = {field}_sqr(l); + tmp = {field}_sqr(tmp); + tmp = {field}_mul(tmp, l); + tmp = {field}_add(tmp, post_add); + + return tmp; + }} + +state add_round_key(__constant {field} constants[{constants_elements}], state s, int i) {{ + s.elements[i] = {field}_add(s.elements[i], (constants + {round_keys_offset})[s.rk_offset + i]); + return s; +}} + +state apply_matrix (__constant {field} matrix[{width}][{width}], state s) {{ + {field} tmp[{width}]; + for (int i = 0; i < {width}; ++i) {{ + tmp[i] = s.elements[i]; + s.elements[i] = {field}_ZERO; + }} + + int size = {width}*{width}; + for (int j = 0; j < {width}; ++j) {{ + for (int i = 0; i < {width}; ++i) {{ + s.elements[j] = {field}_add(s.elements[j], {field}_mul(matrix[i][j], tmp[i])); + }} + }} + return s; + }} + +{field} scalar_product(__constant 
{field}* a, {field}* b, int size) {{ + {field} res = {field}_ZERO; + + for (int i = 0; i < size; ++i) {{ + {field} tmp = {field}_mul(a[i], b[i]); + res = {field}_add(res, tmp); + }} + + return res; + }} + +state apply_sparse_matrix (__constant {field} sm[{sparse_matrix_size}], state s) {{ + {field} first_elt = s.elements[0]; + + s.elements[0] = scalar_product(sm + {w_hat_offset}, s.elements, {width}); + + for (int i = 1; i < {width}; ++i) {{ + {field} val = {field}_mul((sm + {v_rest_offset})[i-1], first_elt); + s.elements[i] = {field}_add(s.elements[i], val); + }} + + return s; + }} + +state apply_round_matrix (__constant {field} constants[{constants_elements}], state s) {{ + if (s.current_round == {sparse_offset}) {{ + s = apply_matrix(constants + {pre_sparse_matrix_offset}, s); + }} else if ((s.current_round > {sparse_offset}) && (s.current_round < {full_half} + {partial_rounds})) {{ + int index = s.current_round - {sparse_offset} - 1; + s = apply_sparse_matrix(constants + {sparse_matrixes_offset} + (index * {sparse_matrix_size}), s); + }} else {{ + s = apply_matrix(constants + {mds_matrix_offset}, s); + }} + return s; + }} + +state add_full_round_keys (__constant {field} constants[{constants_elements}], state s) {{ + for (int i = 0; i < {width}; ++i) {{ + s = add_round_key(constants, s, i); + }} + s.rk_offset += {width}; + return s; + }} + +state add_partial_round_key (__constant {field} constants[{constants_elements}], state s) {{ + s = add_round_key(constants, s, 0); + s.rk_offset += 1; + return s; +}} + +state full_round (__constant {field} constants[{constants_elements}], state s) {{ + for (int i = 0; i < {width}; ++i) {{ + s.elements[i] = quintic_s_box(s.elements[i], {field}_ZERO, (constants + {round_keys_offset})[s.rk_offset + i]); + }} + s.rk_offset += {width}; + s = apply_round_matrix(constants, s); + s.current_round += 1; + return s; +}} + +state last_full_round (__constant {field} constants[{constants_elements}], state s) {{ + for (int i = 0; i < 
{width}; ++i) {{ + s.elements[i] = quintic_s_box(s.elements[i], {field}_ZERO, {field}_ZERO); + }} + s = apply_round_matrix(constants, s); + return s; +}} + +state partial_round (__constant {field} constants[{constants_elements}], state s) {{ + s.elements[0] = quintic_s_box(s.elements[0], {field}_ZERO, (constants + {round_keys_offset})[s.rk_offset]); + s.rk_offset += 1; + s = apply_round_matrix(constants, s); + s.current_round += 1; + return s; +}} + +state hash (__constant {field} constants[{constants_elements}], state s) {{ + s = add_full_round_keys(constants, s); + + for (int i = 0; i < {full_half}; ++i) {{ + s = full_round(constants, s); + }} + for (int i = 0; i < {partial_rounds}; ++ i) {{ + s = partial_round(constants, s); + }} + for (int i = 0; i < ({full_half} - 1); ++ i) {{ + s = full_round(constants, s); + }} + s = last_full_round(constants, s); + + return s; + }} + +__kernel void hash_preimages(__constant {field} constants[{constants_elements}], + __global {field} *preimages, + __global {field} *digests, + int batch_size + ) {{ + int global_id = get_global_id(0); + + if (global_id < batch_size) {{ + int offset = global_id * {arity}; + + state s; + s.elements[0] = constants[{domain_tag_offset}]; + for (int i = 0; i < {arity}; ++i) {{ + s.elements[i+1] = preimages[offset + i]; + }} + s.current_round = 0; + s.rk_offset = 0; + + s = hash(constants, s); + + digests[global_id] = s.elements[1]; + }} + }} diff --git a/src/proteus/gpu.rs b/src/proteus/gpu.rs new file mode 100644 index 00000000..48c28976 --- /dev/null +++ b/src/proteus/gpu.rs @@ -0,0 +1,318 @@ +use super::sources::generate_program; +use crate::error::{ClError, Error}; +use crate::hash_type::HashType; +use crate::poseidon::PoseidonConstants; +use crate::{Arity, BatchHasher, Strength, DEFAULT_STRENGTH}; +use bellperson::bls::{Bls12, Fr, FrRepr}; +use ff::{Field, PrimeField, PrimeFieldDecodingError}; +use generic_array::{typenum, ArrayLength, GenericArray}; +use log::info; +use 
rust_gpu_tools::opencl::{cl_device_id, Device, GPUSelector}; +use rust_gpu_tools::{call_kernel, opencl}; +use std::collections::HashMap; +use std::marker::PhantomData; +use typenum::{U11, U2, U8}; + +#[derive(Debug)] +struct GPUConstants(PoseidonConstants) +where + A: Arity; + +pub struct CLBatchHasher +where + A: Arity, +{ + device: opencl::Device, + constants: GPUConstants, + constants_buffer: opencl::Buffer, + max_batch_size: usize, + program: opencl::Program, +} + +pub struct DerivedConstants { + pub arity: usize, + pub partial_rounds: usize, + pub width: usize, + pub sparse_matrix_size: usize, + pub full_half: usize, + pub sparse_offset: usize, + pub constants_elements: usize, + + // Offsets + pub domain_tag_offset: usize, + pub round_keys_offset: usize, + pub mds_matrix_offset: usize, + pub pre_sparse_matrix_offset: usize, + pub sparse_matrixes_offset: usize, + pub w_hat_offset: usize, + pub v_rest_offset: usize, +} + +impl GPUConstants +where + A: Arity, +{ + fn derived_constants(&self) -> DerivedConstants { + let c = &self.0; + let arity = c.arity(); + let full_rounds = c.full_rounds; + let partial_rounds = c.partial_rounds; + let sparse_count = partial_rounds; + let width = arity + 1; + let sparse_matrix_size = 2 * width - 1; + let rk_count = width * full_rounds + partial_rounds; + let full_half = full_rounds / 2; + let sparse_offset = full_half - 1; + let constants_elements = + 1 + rk_count + (width * width) + (width * width) + (sparse_count * sparse_matrix_size); + + let matrix_size = width * width; + let mut offset = 0; + let domain_tag_offset = offset; + offset += 1; + let round_keys_offset = offset; + offset += rk_count; + let mds_matrix_offset = offset; + offset += matrix_size; + let pre_sparse_matrix_offset = offset; + offset += matrix_size; + let sparse_matrixes_offset = offset; + + let w_hat_offset = 0; + let v_rest_offset = width; + + DerivedConstants { + arity, + partial_rounds, + width, + sparse_matrix_size, + full_half, + sparse_offset, + 
constants_elements, + domain_tag_offset, + round_keys_offset, + mds_matrix_offset, + pre_sparse_matrix_offset, + sparse_matrixes_offset, + w_hat_offset, + v_rest_offset, + } + } +} + +impl GPUConstants +where + A: Arity, +{ + fn full_rounds(&self) -> usize { + self.0.full_rounds + } + + fn partial_rounds(&self) -> usize { + self.0.partial_rounds + } + + fn to_buffer(&self, program: &opencl::Program) -> Result, Error> { + let DerivedConstants { + arity: _, + partial_rounds: _, + width: _, + sparse_matrix_size: _, + full_half: _, + sparse_offset: _, + constants_elements, + domain_tag_offset, + round_keys_offset, + mds_matrix_offset, + pre_sparse_matrix_offset, + sparse_matrixes_offset, + w_hat_offset: _, + v_rest_offset: _, + } = self.derived_constants(); + + let mut buffer = program + .create_buffer::(constants_elements) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + let c = &self.0; + + buffer + .write_from(domain_tag_offset, &[c.domain_tag]) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + buffer + .write_from(round_keys_offset, &c.compressed_round_constants) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + buffer + .write_from( + mds_matrix_offset, + c.mds_matrices + .m + .iter() + .flatten() + .cloned() + .collect::>() + .as_slice(), + ) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + buffer + .write_from( + pre_sparse_matrix_offset, + c.pre_sparse_matrix + .iter() + .flatten() + .cloned() + .collect::>() + .as_slice(), + ) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + let mut sm_elts = Vec::new(); + for sm in c.sparse_matrixes.iter() { + sm_elts.extend(sm.w_hat.iter()); + sm_elts.extend(sm.v_rest.iter()); + } + buffer + .write_from(sparse_matrixes_offset, &sm_elts) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + Ok(buffer) + } +} + +pub fn get_device(selector: &GPUSelector) -> Result<&'static opencl::Device, Error> { + if let Some(device) = selector.get_device() { + info!("device: {:?}", device); + Ok(device) 
+ } else { + return Err(Error::ClError(ClError::BusIdNotAvailable)); + } +} + +impl CLBatchHasher +where + A: Arity, +{ + /// Create a new `GPUBatchHasher` and initialize it with state corresponding with its `A`. + pub(crate) fn new(selector: &GPUSelector, max_batch_size: usize) -> Result { + let device = get_device(selector)?; + Self::new_with_strength(device, DEFAULT_STRENGTH, max_batch_size) + } + + pub(crate) fn new_with_strength( + device: &opencl::Device, + strength: Strength, + max_batch_size: usize, + ) -> Result { + let constants = GPUConstants(PoseidonConstants::::new_with_strength(strength)); + let src = generate_program::(true, constants.derived_constants()); + let program = opencl::Program::from_opencl(device.clone(), &src) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + let constants_buffer = constants.to_buffer(&program)?; + Ok(Self { + device: device.clone(), + constants, + constants_buffer, + max_batch_size, + program, + }) + } + + pub(crate) fn device(&self) -> opencl::Device { + self.device.clone() + } +} +const LOCAL_WORK_SIZE: usize = 256; +impl BatchHasher for CLBatchHasher +where + A: Arity, +{ + fn hash(&mut self, preimages: &[GenericArray]) -> Result, Error> { + let local_work_size = LOCAL_WORK_SIZE; + let max_batch_size = self.max_batch_size; + let batch_size = preimages.len(); + assert!(batch_size <= max_batch_size); + + // Set `global_work_size` to smallest multiple of `local_work_size` >= `batch-size`. 
+ let global_work_size = ((batch_size / local_work_size) + + (batch_size % local_work_size != 0) as usize) + * local_work_size; + + let num_hashes = preimages.len(); + + let kernel = + self.program + .create_kernel("hash_preimages", global_work_size, Some(local_work_size)); + + let mut preimages_buffer = self + .program + .create_buffer::>(num_hashes) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + preimages_buffer + .write_from(0, preimages) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + let result_buffer = self + .program + .create_buffer::(num_hashes) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + call_kernel!( + kernel, + &self.constants_buffer, + &preimages_buffer, + &result_buffer, + preimages.len() as i32 + ) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + let mut frs = vec![::zero(); num_hashes]; + result_buffer + .read_into(0, &mut frs) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + Ok(frs.to_vec()) + } + + fn max_batch_size(&self) -> usize { + self.max_batch_size + } +} + +#[cfg(test)] +#[cfg(all(feature = "opencl", not(target_os = "macos")))] +mod test { + use super::*; + use crate::poseidon::{Poseidon, SimplePoseidonBatchHasher}; + use generic_array::sequence::GenericSequence; + use rand::SeedableRng; + use rand_xorshift::XorShiftRng; + + #[test] + fn test_batch_hash_2() { + let mut rng = XorShiftRng::from_seed(crate::TEST_SEED); + let device = get_device(&GPUSelector::Index(0)).unwrap(); + + // NOTE: `batch_size` is not a multiple of `LOCAL_WORK_SIZE`. 
+ let batch_size = 1025; + + let mut cl_hasher = + CLBatchHasher::::new_with_strength(device, Strength::Standard, batch_size).unwrap(); + let mut simple_hasher = + SimplePoseidonBatchHasher::::new_with_strength(Strength::Standard, batch_size) + .unwrap(); + + let preimages = (0..batch_size) + .map(|_| GenericArray::::generate(|_| Fr::random(&mut rng))) + .collect::>(); + + let cl_hashes = cl_hasher.hash(&preimages).unwrap(); + let expected_hashes: Vec<_> = simple_hasher.hash(&preimages).unwrap(); + + dbg!( + &cl_hashes, + &expected_hashes, + &cl_hashes.len(), + &expected_hashes.len() + ); + + assert_eq!(expected_hashes, cl_hashes); + } +} diff --git a/src/proteus/mod.rs b/src/proteus/mod.rs new file mode 100644 index 00000000..b2c115b8 --- /dev/null +++ b/src/proteus/mod.rs @@ -0,0 +1,2 @@ +pub mod gpu; +pub mod sources; diff --git a/src/proteus/sources.rs b/src/proteus/sources.rs new file mode 100644 index 00000000..358f97f9 --- /dev/null +++ b/src/proteus/sources.rs @@ -0,0 +1,65 @@ +use super::gpu::DerivedConstants; +use bellperson::bls::{Engine, Fr}; +use ff::PrimeField; +use itertools::join; + +fn config() -> String { + "".to_string() +} + +fn poseidon_source(field: &str, derived_constants: DerivedConstants) -> String { + let DerivedConstants { + arity, + partial_rounds, + width, + sparse_matrix_size, + full_half, + sparse_offset, + constants_elements, + domain_tag_offset, + round_keys_offset, + mds_matrix_offset, + pre_sparse_matrix_offset, + sparse_matrixes_offset, + w_hat_offset, + v_rest_offset, + } = derived_constants; + + format!( + include_str!("cl/poseidon.cl"), + arity = arity, + field = field, + partial_rounds = partial_rounds, + width = width, + sparse_matrix_size = sparse_matrix_size, + full_half = full_half, + sparse_offset = sparse_offset, + constants_elements = constants_elements, + domain_tag_offset = domain_tag_offset, + round_keys_offset = round_keys_offset, + mds_matrix_offset = mds_matrix_offset, + pre_sparse_matrix_offset = 
pre_sparse_matrix_offset,
+        w_hat_offset = w_hat_offset,
+        v_rest_offset = v_rest_offset,
+        sparse_matrixes_offset = sparse_matrixes_offset,
+    )
+}
+
+pub fn generate_program<Fr>(limb64: bool, derived_constants: DerivedConstants) -> String
+where
+    Fr: PrimeField,
+{
+    let field_source = if limb64 {
+        ff_cl_gen::field::<Fr, ff_cl_gen::Limb64>("Fr")
+    } else {
+        ff_cl_gen::field::<Fr, ff_cl_gen::Limb32>("Fr")
+    };
+    join(
+        &[
+            config(),
+            field_source,
+            poseidon_source("Fr", derived_constants),
+        ],
+        "\n",
+    )
+}
diff --git a/src/tree_builder.rs b/src/tree_builder.rs
index 044f3437..18f10c6e 100644
--- a/src/tree_builder.rs
+++ b/src/tree_builder.rs
@@ -247,7 +247,7 @@ where
     }
 }

-#[cfg(all(feature = "gpu", not(target_os = "macos")))]
+#[cfg(all(any(feature = "gpu", feature = "opencl"), not(target_os = "macos")))]
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -263,6 +263,9 @@ mod tests {

         #[cfg(all(feature = "gpu", not(target_os = "macos")))]
         test_tree_builder_aux(Some(BatcherType::GPU), 512, 32, 512, 512);
+
+        #[cfg(all(feature = "opencl", not(target_os = "macos")))]
+        test_tree_builder_aux(Some(BatcherType::OpenCL), 512, 32, 512, 512);
     }

     fn test_tree_builder_aux(
diff --git a/src/cl.rs b/src/triton/cl.rs
similarity index 69%
rename from src/cl.rs
rename to src/triton/cl.rs
index 423b96cb..c5cf187e 100644
--- a/src/cl.rs
+++ b/src/triton/cl.rs
@@ -1,3 +1,4 @@
+use crate::error::{ClError, ClResult};
 use log::*;
 use rust_gpu_tools::opencl::{cl_device_id, Device, GPUSelector};
 use std::collections::HashMap;
@@ -8,57 +9,11 @@ use triton::bindings;
 use triton::FutharkContext;

 const MAX_LEN: usize = 128;

-#[repr(C)]
-#[derive(Debug, Clone, Default)]
-struct cl_amd_device_topology {
-    r#type: u32,
-    unused: [u8; 17],
-    bus: u8,
-    device: u8,
-    function: u8,
-}
-
 lazy_static!
{
    pub static ref FUTHARK_CONTEXT_MAP: RwLock<HashMap<GPUSelector, Arc<Mutex<FutharkContext>>>> =
        RwLock::new(HashMap::new());
}

-#[derive(Debug, Clone)]
-pub enum ClError {
-    DeviceNotFound,
-    PlatformNotFound,
-    BusIdNotAvailable,
-    NvidiaBusIdNotAvailable,
-    AmdTopologyNotAvailable,
-    PlatformNameNotAvailable,
-    CannotCreateContext,
-    CannotCreateQueue,
-    GetDeviceError,
-}
-pub type ClResult<T> = std::result::Result<T, ClError>;
-
-impl fmt::Display for ClError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
-        match self {
-            ClError::DeviceNotFound => write!(f, "Device not found."),
-            ClError::PlatformNotFound => write!(f, "Platform not found."),
-            ClError::BusIdNotAvailable => write!(f, "Cannot extract bus-id for the given device."),
-            ClError::NvidiaBusIdNotAvailable => {
-                write!(f, "Cannot extract bus-id for the given Nvidia device.")
-            }
-            ClError::AmdTopologyNotAvailable => {
-                write!(f, "Cannot extract bus-id for the given AMD device.")
-            }
-            ClError::PlatformNameNotAvailable => {
-                write!(f, "Cannot extract platform name for the given platform.")
-            }
-            ClError::CannotCreateContext => write!(f, "Cannot create cl_context."),
-            ClError::CannotCreateQueue => write!(f, "Cannot create cl_command_queue."),
-            ClError::GetDeviceError => write!(f, "Cannot get Device"),
-        }
-    }
-}
-
 fn create_context(device: bindings::cl_device_id) -> ClResult<bindings::cl_context> {
     let mut res = 0i32;
     let context = unsafe {
diff --git a/src/gpu.rs b/src/triton/gpu.rs
similarity index 99%
rename from src/gpu.rs
rename to src/triton/gpu.rs
index cd4fef29..63aad589 100644
--- a/src/gpu.rs
+++ b/src/triton/gpu.rs
@@ -1,4 +1,4 @@
-use crate::cl;
+use super::cl;
 use crate::error::Error;
 use crate::hash_type::HashType;
 use crate::poseidon::PoseidonConstants;
@@ -630,8 +630,8 @@ fn u64_vec<'a, U: ArrayLength<Fr>>(vec: &'a [GenericArray<Fr, U>]) -> Vec<u64> {
 #[cfg(all(feature = "gpu", not(target_os = "macos")))]
 mod tests {
     use super::*;
-    use crate::gpu::BatcherState;
     use crate::poseidon::{Poseidon, SimplePoseidonBatchHasher};
+    use
crate::triton::gpu::BatcherState; use crate::BatchHasher; use ff::{Field, ScalarEngine}; use generic_array::sequence::GenericSequence; @@ -667,6 +667,7 @@ mod tests { let (hashes, _) = mbatch_hash2(&mut ctx.lock().unwrap(), &mut state, preimages.as_slice()).unwrap(); + let gpu_hashes = gpu_hasher.hash(&preimages).unwrap(); let expected_hashes: Vec<_> = simple_hasher.hash(&preimages).unwrap(); diff --git a/src/triton/mod.rs b/src/triton/mod.rs new file mode 100644 index 00000000..8bd7fdd8 --- /dev/null +++ b/src/triton/mod.rs @@ -0,0 +1,2 @@ +pub mod cl; +pub mod gpu;