Merge pull request #78 from filecoin-project/feat/opencl

Implement pure OpenCL batch hashing.
argumentcomputer · Jan 20, 2021 · 7f90f5b · 7f90f5b
2 parents 6a4071c + 569db95
commit 7f90f5b
Show file tree

Hide file tree

Showing 19 changed files with 749 additions and 112 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -14,7 +14,7 @@ restore-workspace: &restore-workspace
 restore-cache: &restore-cache
   restore_cache:
     keys:
-      - cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
+      - cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
       - repo-source-{{ .Branch }}-{{ .Revision }}
 
 commands:
@@ -35,13 +35,22 @@ commands:
           no_output_timeout: 5m
       - run:
           name: Test (pairing, GPU) (<< parameters.target >>)
-          command: TARGET=<< parameters.target >> cargo test --release --features gpu  -- --test-threads=1
+          command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features pairing,gpu -- --test-threads=1
           no_output_timeout: 30m
 
       - run:
           name: Test (blst, GPU) (<< parameters.target >>)
           command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features blst,gpu -- --test-threads=1
           no_output_timeout: 30m
+      - run:
+          name: Test (pairing, opencl) (<< parameters.target >>)
+          command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features pairing,opencl -- --test-threads=1
+          no_output_timeout: 30m
+
+      - run:
+          name: Test (blst, opencl) (<< parameters.target >>)
+          command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features blst,opencl -- --test-threads=1
+          no_output_timeout: 30m
 
 jobs:
   cargo_fetch:
@@ -63,12 +72,12 @@ jobs:
           command: cargo generate-lockfile
       - restore_cache:
           keys:
-            - cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
+            - cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
       - run: cargo update
       - run: cargo fetch
       - run: rustup install $(cat rust-toolchain)
       - run: rustup default $(cat rust-toolchain)
-      - run: rustup install nightly
+      - run: rustup install nightly-2020-11-18
       - run: rustup component add rustfmt-preview
       - run: rustup component add clippy-preview
       - run: rustc --version
@@ -78,7 +87,7 @@ jobs:
           paths:
             - gpuci
       - save_cache:
-          key: cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
+          key: cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
           paths:
             - "~/.cargo"
             - "~/.rustup"
@@ -175,10 +184,10 @@ jobs:
       - run: sudo apt install -y ocl-icd-opencl-dev
       - run:
           name: Run cargo release build (pairing, gpu)
-          command: cargo +nightly build -Zpackage-features --release -p gbench --no-default-features --features pairing,gpu
+          command: cargo +nightly-2020-11-18 build -Zpackage-features --release -p gbench --no-default-features --features pairing,gpu
       - run:
           name: Run cargo release build (blst, gpu)
-          command: cargo +nightly build -Zpackage-features --release -p gbench --no-default-features --features blst,gpu
+          command: cargo +nightly-2020-11-18 build -Zpackage-features --release -p gbench --no-default-features --features blst,gpu
 
   benches:
     executor: default

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## Unreleased
+- Pure OpenCL implementation of batch hashing. (https://github.com/filecoin-project/neptune/pull/78)
 
 ## 2.4.0 - 2020-11-17
 

diff --git a/Cargo.toml b/Cargo.toml
@@ -11,12 +11,15 @@ repository = "https://github.com/filecoin-project/neptune"
 lazy_static = "1.4.0"
 bellperson = { version = "0.12", default-features = false }
 blake2s_simd = "0.5"
+blstrs = { version = "0.2.0", optional = true }
 byteorder = "1"
 ff = { version = "0.2.1", package = "fff" }
 generic-array = "0.14.4"
 log = "0.4.8"
 rust-gpu-tools = { version = "0.3.0", optional = true }
 triton = { version = "2.1.0", package = "neptune-triton", default-features = false, features = ["opencl"], optional = true }
+itertools = { version = "0.8.0" }
+ff-cl-gen = "0.2.0"
 
 [dev-dependencies]
 criterion = "0.3"
@@ -45,6 +48,7 @@ codegen-units = 1
 [features]
 default = ["pairing"]
 gpu = ["triton", "rust-gpu-tools"]
+opencl = ["rust-gpu-tools"]
 pairing = ["bellperson/pairing"]
 blst = ["bellperson/blst"]
 

diff --git a/README.md b/README.md
@@ -19,7 +19,14 @@ proofs (in SNARKs).
 
 Neptune also supports batch hashing and tree building, which can be performed on a GPU. The underlying GPU
 implementation, [neptune-triton](https://github.com/filecoin-project/neptune-triton) is implemented in the [Futhark
-Programming Language](https://futhark-lang.org/).
+Programming Language](https://futhark-lang.org/). To use `neptune-triton` GPU batch hashing, compile `neptune` with the
+`gpu` feature.
+
+Neptune now implements GPU batch hashing in pure OpenCL. The initial implementation is a bit less than 2x faster than
+the Futhark implementation, so once stabilized this will likely be the preferred option. The pure OpenCL batch hashing
+is provided by the internal `proteus` module. To use `proteus`, compile `neptune` with the `opencl` feature.
+
+The `gpu` and `opencl` features are mutually exclusive.
 
 At the time of the 1.0.0 release, Neptune on an RTX 2080Ti GPU could build 8-ary Merkle trees for 4GiB of input in 16 seconds.
 
@@ -35,7 +42,7 @@ The following are likely areas of future work:
 
 - [x] Support for multiple GPUs.
 - [x] Support domain separation tag.
-- [ ] Improve throughput (?) by using OpenCL directly.
+- [x] Improve throughput (?) by using OpenCL directly.
 
 ## History
 

diff --git a/gbench/Cargo.toml b/gbench/Cargo.toml
@@ -15,12 +15,13 @@ env_logger = "0.7.1"
 ff = { version = "0.2.1", package = "fff" }
 generic-array = "0.14.4"
 log = "0.4.8"
-neptune = { path = "../", default-features = false, features=["gpu"] }
+neptune = { path = "../", default-features = false }
 rust-gpu-tools = { version = "0.3.0", optional = true }
 structopt = { version = "0.3", default-features = false }
 
 [features]
 default = ["pairing", "gpu"]
 gpu = ["neptune/gpu", "rust-gpu-tools"]
+opencl = ["neptune/opencl", "rust-gpu-tools"]
 pairing = ["neptune/pairing", "bellperson/pairing"]
 blst = ["neptune/blst", "bellperson/blst"]
diff --git a/gbench/src/main.rs b/gbench/src/main.rs
@@ -107,7 +107,7 @@ struct Opts {
 }
 
 fn main() -> Result<(), Error> {
-    #[cfg(all(feature = "gpu", target_os = "macos"))]
+    #[cfg(all(any(feature = "gpu", feature = "opencl"), target_os = "macos"))]
     unimplemented!("Running on macos is not recommended and may have bad consequences -- experiment at your own risk.");
     env_logger::init();
 
@@ -127,14 +127,22 @@ fn main() -> Result<(), Error> {
 
     // Comma separated list of GPU bus-ids
     let gpus = std::env::var("NEPTUNE_GBENCH_GPUS");
+
+    #[cfg(feature = "gpu")]
+    let default_type = BatcherType::GPU;
+
+    #[cfg(feature = "opencl")]
+    let default_type = BatcherType::OpenCL;
+
     let batcher_types = gpus
         .map(|v| {
             v.split(",")
                 .map(|s| s.parse::<u32>().expect("Invalid Bus-Id number!"))
                 .map(|bus_id| BatcherType::CustomGPU(GPUSelector::BusId(bus_id)))
                 .collect::<Vec<_>>()
         })
-        .unwrap_or(vec![BatcherType::GPU]);
+        .unwrap_or(vec![default_type]);
+
     let mut threads = Vec::new();
     for batcher_type in batcher_types {
         threads.push(thread::spawn(move || {

diff --git a/src/batch_hasher.rs b/src/batch_hasher.rs
@@ -1,11 +1,14 @@
+use rust_gpu_tools::opencl;
 use std::fmt::{self, Debug};
 use std::marker::PhantomData;
 use std::sync::{Arc, Mutex};
 
-#[cfg(feature = "gpu")]
-use crate::cl;
 use crate::error::Error;
 use crate::poseidon::SimplePoseidonBatchHasher;
+#[cfg(feature = "opencl")]
+use crate::proteus::gpu::{get_device, CLBatchHasher};
+#[cfg(feature = "gpu")]
+use crate::triton::cl;
 use crate::{Arity, BatchHasher, Strength, DEFAULT_STRENGTH};
 use bellperson::bls::Fr;
 use generic_array::GenericArray;
@@ -16,34 +19,50 @@ use triton::FutharkContext;
 
 #[derive(Clone)]
 pub enum BatcherType {
-    #[cfg(feature = "gpu")]
+    #[cfg(any(feature = "gpu", feature = "opencl"))]
     CustomGPU(GPUSelector),
     #[cfg(feature = "gpu")]
     FromFutharkContext(Arc<Mutex<FutharkContext>>),
+    #[cfg(feature = "opencl")]
+    FromDevice(opencl::Device),
+    #[cfg(feature = "gpu")]
     GPU,
     CPU,
+    #[cfg(feature = "opencl")]
+    OpenCL,
 }
 
 impl Debug for BatcherType {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.write_fmt(format_args!("BatcherType::"))?;
         match self {
+            #[cfg(feature = "gpu")]
             BatcherType::FromFutharkContext(_) => f.write_fmt(format_args!("FromFutharkContext")),
+            #[cfg(feature = "opencl")]
+            BatcherType::FromDevice(_) => f.write_fmt(format_args!("FromDevice")),
+            #[cfg(any(feature = "gpu", feature = "opencl"))]
             BatcherType::CustomGPU(x) => f.write_fmt(format_args!("CustomGPU({:?})", x)),
             BatcherType::CPU => f.write_fmt(format_args!("CPU")),
+            #[cfg(feature = "gpu")]
             BatcherType::GPU => f.write_fmt(format_args!("GPU")),
+            #[cfg(feature = "opencl")]
+            BatcherType::OpenCL => f.write_fmt(format_args!("OpenCL")),
         }
     }
 }
 
-use crate::gpu::GPUBatchHasher;
+#[cfg(feature = "gpu")]
+use crate::triton::gpu::GPUBatchHasher;
 
 pub enum Batcher<A>
 where
     A: Arity<Fr>,
 {
+    #[cfg(feature = "gpu")]
     GPU(GPUBatchHasher<A>),
     CPU(SimplePoseidonBatchHasher<A>),
+    #[cfg(feature = "opencl")]
+    OpenCL(CLBatchHasher<A>),
 }
 
 impl<A> Batcher<A>
@@ -52,8 +71,11 @@ where
 {
     pub(crate) fn t(&self) -> BatcherType {
         match self {
+            #[cfg(feature = "gpu")]
             Batcher::GPU(_) => BatcherType::GPU,
             Batcher::CPU(_) => BatcherType::CPU,
+            #[cfg(feature = "opencl")]
+            Batcher::OpenCL(_) => BatcherType::OpenCL,
         }
     }
 
@@ -67,6 +89,9 @@ where
         max_batch_size: usize,
     ) -> Result<Self, Error> {
         match t {
+            BatcherType::CPU => Ok(Batcher::CPU(
+                SimplePoseidonBatchHasher::<A>::new_with_strength(strength, max_batch_size)?,
+            )),
             #[cfg(feature = "gpu")]
             BatcherType::GPU => Ok(Batcher::GPU(GPUBatchHasher::<A>::new_with_strength(
                 cl::default_futhark_context()?,
@@ -81,9 +106,6 @@ where
                     max_batch_size,
                 )?))
             }
-            BatcherType::CPU => Ok(Batcher::CPU(
-                SimplePoseidonBatchHasher::<A>::new_with_strength(strength, max_batch_size)?,
-            )),
             #[cfg(feature = "gpu")]
             BatcherType::FromFutharkContext(futhark_context) => {
                 Ok(Batcher::GPU(GPUBatchHasher::<A>::new_with_strength(
@@ -92,6 +114,24 @@ where
                     max_batch_size,
                 )?))
             }
+            #[cfg(feature = "opencl")]
+            BatcherType::OpenCL => Ok(Batcher::OpenCL(CLBatchHasher::<A>::new_with_strength(
+                get_device(&GPUSelector::Index(0))?,
+                strength,
+                max_batch_size,
+            )?)),
+            #[cfg(feature = "opencl")]
+            BatcherType::CustomGPU(selector) => {
+                Ok(Batcher::OpenCL(CLBatchHasher::<A>::new_with_strength(
+                    get_device(selector)?,
+                    strength,
+                    max_batch_size,
+                )?))
+            }
+            #[cfg(feature = "opencl")]
+            BatcherType::FromDevice(device) => Ok(Batcher::OpenCL(
+                CLBatchHasher::<A>::new_with_strength(&device, strength, max_batch_size)?,
+            )),
         }
     }
 
@@ -102,6 +142,14 @@ where
             _ => None,
         }
     }
+
+    #[cfg(feature = "opencl")]
+    pub(crate) fn device(&self) -> Option<opencl::Device> {
+        match self {
+            Batcher::OpenCL(b) => Some(b.device()),
+            _ => None,
+        }
+    }
 }
 
 impl<A> BatchHasher<A> for Batcher<A>
@@ -110,42 +158,21 @@ where
 {
     fn hash(&mut self, preimages: &[GenericArray<Fr, A>]) -> Result<Vec<Fr>, Error> {
         match self {
-            Batcher::GPU(batcher) => batcher.hash(preimages),
             Batcher::CPU(batcher) => batcher.hash(preimages),
+            #[cfg(feature = "gpu")]
+            Batcher::GPU(batcher) => batcher.hash(preimages),
+            #[cfg(feature = "opencl")]
+            Batcher::OpenCL(batcher) => batcher.hash(preimages),
         }
     }
 
     fn max_batch_size(&self) -> usize {
         match self {
-            Batcher::GPU(batcher) => batcher.max_batch_size(),
             Batcher::CPU(batcher) => batcher.max_batch_size(),
+            #[cfg(feature = "gpu")]
+            Batcher::GPU(batcher) => batcher.max_batch_size(),
+            #[cfg(feature = "opencl")]
+            Batcher::OpenCL(batcher) => batcher.max_batch_size(),
         }
     }
 }
-
-// /// NoGPUBatchHasher is a dummy required so we can build with the gpu flag even on platforms on which we cannot currently
-// /// run with GPU.
-pub struct NoGPUBatchHasher<A>(PhantomData<A>);
-
-impl<A> BatchHasher<A> for NoGPUBatchHasher<A>
-where
-    A: Arity<Fr>,
-{
-    fn hash(&mut self, _preimages: &[GenericArray<Fr, A>]) -> Result<Vec<Fr>, Error> {
-        unimplemented!();
-    }
-
-    fn max_batch_size(&self) -> usize {
-        unimplemented!();
-    }
-}
-
-#[cfg(feature = "gpu")]
-impl<A> NoGPUBatchHasher<A>
-where
-    A: Arity<Fr>,
-{
-    fn futhark_context(&self) -> Arc<Mutex<FutharkContext>> {
-        unimplemented!()
-    }
-}