diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c2b7ce6..22770355 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,7 +14,7 @@ restore-workspace: &restore-workspace restore-cache: &restore-cache restore_cache: keys: - - cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} + - cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} - repo-source-{{ .Branch }}-{{ .Revision }} commands: @@ -35,13 +35,22 @@ commands: no_output_timeout: 5m - run: name: Test (pairing, GPU) (<< parameters.target >>) - command: TARGET=<< parameters.target >> cargo test --release --features gpu -- --test-threads=1 + command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features pairing,gpu -- --test-threads=1 no_output_timeout: 30m - run: name: Test (blst, GPU) (<< parameters.target >>) command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features blst,gpu -- --test-threads=1 no_output_timeout: 30m + - run: + name: Test (pairing, opencl) (<< parameters.target >>) + command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features pairing,opencl -- --test-threads=1 + no_output_timeout: 30m + + - run: + name: Test (blst, opencl) (<< parameters.target >>) + command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features blst,opencl -- --test-threads=1 + no_output_timeout: 30m jobs: cargo_fetch: @@ -63,12 +72,12 @@ jobs: command: cargo generate-lockfile - restore_cache: keys: - - cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} + - cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} - run: cargo update - run: cargo fetch - run: rustup install $(cat rust-toolchain) - run: rustup default $(cat rust-toolchain) - - run: rustup 
install nightly + - run: rustup install nightly-2020-11-18 - run: rustup component add rustfmt-preview - run: rustup component add clippy-preview - run: rustc --version @@ -78,7 +87,7 @@ jobs: paths: - gpuci - save_cache: - key: cargo-v0-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} + key: cargo-v1-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }} paths: - "~/.cargo" - "~/.rustup" @@ -175,10 +184,10 @@ jobs: - run: sudo apt install -y ocl-icd-opencl-dev - run: name: Run cargo release build (pairing, gpu) - command: cargo +nightly build -Zpackage-features --release -p gbench --no-default-features --features pairing,gpu + command: cargo +nightly-2020-11-18 build -Zpackage-features --release -p gbench --no-default-features --features pairing,gpu - run: name: Run cargo release build (blst, gpu) - command: cargo +nightly build -Zpackage-features --release -p gbench --no-default-features --features blst,gpu + command: cargo +nightly-2020-11-18 build -Zpackage-features --release -p gbench --no-default-features --features blst,gpu benches: executor: default diff --git a/CHANGELOG.md b/CHANGELOG.md index fe9f08de..ca6eb597 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://book.async.rs/overview/stability-guarantees.html). ## Unreleased +- Pure OpenCL implementation of batch hashing. 
(https://github.com/filecoin-project/neptune/pull/78) ## 2.4.0 - 2020-11-17 diff --git a/Cargo.toml b/Cargo.toml index dc78f7fd..d2b118c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,12 +11,15 @@ repository = "https://github.com/filecoin-project/neptune" lazy_static = "1.4.0" bellperson = { version = "0.12", default-features = false } blake2s_simd = "0.5" +blstrs = { version = "0.2.0", optional = true } byteorder = "1" ff = { version = "0.2.1", package = "fff" } generic-array = "0.14.4" log = "0.4.8" rust-gpu-tools = { version = "0.3.0", optional = true } triton = { version = "2.1.0", package = "neptune-triton", default-features = false, features = ["opencl"], optional = true } +itertools = { version = "0.8.0" } +ff-cl-gen = "0.2.0" [dev-dependencies] criterion = "0.3" @@ -45,6 +48,7 @@ codegen-units = 1 [features] default = ["pairing"] gpu = ["triton", "rust-gpu-tools"] +opencl = ["rust-gpu-tools"] pairing = ["bellperson/pairing"] blst = ["bellperson/blst"] diff --git a/README.md b/README.md index 10cd5a1b..898768af 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,14 @@ proofs (in SNARKs). Neptune also supports batch hashing and tree building, which can be performed on a GPU. The underlying GPU implementation, [neptune-triton](https://github.com/filecoin-project/neptune-triton) is implemented in the [Futhark -Programming Language](https://futhark-lang.org/). +Programming Language](https://futhark-lang.org/). To use `neptune-triton` GPU batch hashing, compile `neptune` with the +`gpu` feature. + +Neptune now implements GPU batch hashing in pure OpenCL. The initial implementation is a bit less than 2x faster than +the Futhark implementation, so once stabilized this will likely be the preferred option. The pure OpenCL batch hashing +is provided by the internal `proteus` module. To use `proteus`, compile `neptune` with the `opencl` feature. + +The `gpu` and `opencl` features are mutually exclusive. 
At the time of the 1.0.0 release, Neptune on RTX 2080Ti GPU can build 8-ary Merkle trees for 4GiB of input in 16 seconds. @@ -35,7 +42,7 @@ The following are likely areas of future work: - [x] Support for multiple GPUs. - [x] Support domain separation tag. -- [ ] Improve throughput (?) by using OpenCL directly. +- [x] Improve throughput (?) by using OpenCL directly. ## History diff --git a/gbench/Cargo.toml b/gbench/Cargo.toml index e0297ddf..184d71b7 100644 --- a/gbench/Cargo.toml +++ b/gbench/Cargo.toml @@ -15,12 +15,13 @@ env_logger = "0.7.1" ff = { version = "0.2.1", package = "fff" } generic-array = "0.14.4" log = "0.4.8" -neptune = { path = "../", default-features = false, features=["gpu"] } +neptune = { path = "../", default-features = false } rust-gpu-tools = { version = "0.3.0", optional = true } structopt = { version = "0.3", default-features = false } [features] default = ["pairing", "gpu"] gpu = ["neptune/gpu", "rust-gpu-tools"] +opencl = ["neptune/opencl", "rust-gpu-tools"] pairing = ["neptune/pairing", "bellperson/pairing"] blst = ["neptune/blst", "bellperson/blst"] diff --git a/gbench/src/main.rs b/gbench/src/main.rs index 26c3b70b..2418f3d6 100644 --- a/gbench/src/main.rs +++ b/gbench/src/main.rs @@ -107,7 +107,7 @@ struct Opts { } fn main() -> Result<(), Error> { - #[cfg(all(feature = "gpu", target_os = "macos"))] + #[cfg(all(any(feature = "gpu", feature = "opencl"), target_os = "macos"))] unimplemented!("Running on macos is not recommended and may have bad consequences -- experiment at your own risk."); env_logger::init(); @@ -127,6 +127,13 @@ fn main() -> Result<(), Error> { // Comma separated list of GPU bus-ids let gpus = std::env::var("NEPTUNE_GBENCH_GPUS"); + + #[cfg(feature = "gpu")] + let default_type = BatcherType::GPU; + + #[cfg(feature = "opencl")] + let default_type = BatcherType::OpenCL; + let batcher_types = gpus .map(|v| { v.split(",") @@ -134,7 +141,8 @@ fn main() -> Result<(), Error> { .map(|bus_id| 
BatcherType::CustomGPU(GPUSelector::BusId(bus_id))) .collect::>() }) - .unwrap_or(vec![BatcherType::GPU]); + .unwrap_or(vec![default_type]); + let mut threads = Vec::new(); for batcher_type in batcher_types { threads.push(thread::spawn(move || { diff --git a/src/batch_hasher.rs b/src/batch_hasher.rs index e42fd684..32f86273 100644 --- a/src/batch_hasher.rs +++ b/src/batch_hasher.rs @@ -1,11 +1,14 @@ +use rust_gpu_tools::opencl; use std::fmt::{self, Debug}; use std::marker::PhantomData; use std::sync::{Arc, Mutex}; -#[cfg(feature = "gpu")] -use crate::cl; use crate::error::Error; use crate::poseidon::SimplePoseidonBatchHasher; +#[cfg(feature = "opencl")] +use crate::proteus::gpu::{get_device, CLBatchHasher}; +#[cfg(feature = "gpu")] +use crate::triton::cl; use crate::{Arity, BatchHasher, Strength, DEFAULT_STRENGTH}; use bellperson::bls::Fr; use generic_array::GenericArray; @@ -16,34 +19,50 @@ use triton::FutharkContext; #[derive(Clone)] pub enum BatcherType { - #[cfg(feature = "gpu")] + #[cfg(any(feature = "gpu", feature = "opencl"))] CustomGPU(GPUSelector), #[cfg(feature = "gpu")] FromFutharkContext(Arc>), + #[cfg(feature = "opencl")] + FromDevice(opencl::Device), + #[cfg(feature = "gpu")] GPU, CPU, + #[cfg(feature = "opencl")] + OpenCL, } impl Debug for BatcherType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_fmt(format_args!("BatcherType::"))?; match self { + #[cfg(feature = "gpu")] BatcherType::FromFutharkContext(_) => f.write_fmt(format_args!("FromFutharkContext")), + #[cfg(feature = "opencl")] + BatcherType::FromDevice(_) => f.write_fmt(format_args!("FromDevice")), + #[cfg(any(feature = "gpu", feature = "opencl"))] BatcherType::CustomGPU(x) => f.write_fmt(format_args!("CustomGPU({:?})", x)), BatcherType::CPU => f.write_fmt(format_args!("CPU")), + #[cfg(feature = "gpu")] BatcherType::GPU => f.write_fmt(format_args!("GPU")), + #[cfg(feature = "opencl")] + BatcherType::OpenCL => f.write_fmt(format_args!("OpenCL")), } } } -use 
crate::gpu::GPUBatchHasher; +#[cfg(feature = "gpu")] +use crate::triton::gpu::GPUBatchHasher; pub enum Batcher where A: Arity, { + #[cfg(feature = "gpu")] GPU(GPUBatchHasher), CPU(SimplePoseidonBatchHasher), + #[cfg(feature = "opencl")] + OpenCL(CLBatchHasher), } impl Batcher @@ -52,8 +71,11 @@ where { pub(crate) fn t(&self) -> BatcherType { match self { + #[cfg(feature = "gpu")] Batcher::GPU(_) => BatcherType::GPU, Batcher::CPU(_) => BatcherType::CPU, + #[cfg(feature = "opencl")] + Batcher::OpenCL(_) => BatcherType::OpenCL, } } @@ -67,6 +89,9 @@ where max_batch_size: usize, ) -> Result { match t { + BatcherType::CPU => Ok(Batcher::CPU( + SimplePoseidonBatchHasher::::new_with_strength(strength, max_batch_size)?, + )), #[cfg(feature = "gpu")] BatcherType::GPU => Ok(Batcher::GPU(GPUBatchHasher::::new_with_strength( cl::default_futhark_context()?, @@ -81,9 +106,6 @@ where max_batch_size, )?)) } - BatcherType::CPU => Ok(Batcher::CPU( - SimplePoseidonBatchHasher::::new_with_strength(strength, max_batch_size)?, - )), #[cfg(feature = "gpu")] BatcherType::FromFutharkContext(futhark_context) => { Ok(Batcher::GPU(GPUBatchHasher::::new_with_strength( @@ -92,6 +114,24 @@ where max_batch_size, )?)) } + #[cfg(feature = "opencl")] + BatcherType::OpenCL => Ok(Batcher::OpenCL(CLBatchHasher::::new_with_strength( + get_device(&GPUSelector::Index(0))?, + strength, + max_batch_size, + )?)), + #[cfg(feature = "opencl")] + BatcherType::CustomGPU(selector) => { + Ok(Batcher::OpenCL(CLBatchHasher::::new_with_strength( + get_device(selector)?, + strength, + max_batch_size, + )?)) + } + #[cfg(feature = "opencl")] + BatcherType::FromDevice(device) => Ok(Batcher::OpenCL( + CLBatchHasher::::new_with_strength(&device, strength, max_batch_size)?, + )), } } @@ -102,6 +142,14 @@ where _ => None, } } + + #[cfg(feature = "opencl")] + pub(crate) fn device(&self) -> Option { + match self { + Batcher::OpenCL(b) => Some(b.device()), + _ => None, + } + } } impl BatchHasher for Batcher @@ -110,42 +158,21 
@@ where { fn hash(&mut self, preimages: &[GenericArray]) -> Result, Error> { match self { - Batcher::GPU(batcher) => batcher.hash(preimages), Batcher::CPU(batcher) => batcher.hash(preimages), + #[cfg(feature = "gpu")] + Batcher::GPU(batcher) => batcher.hash(preimages), + #[cfg(feature = "opencl")] + Batcher::OpenCL(batcher) => batcher.hash(preimages), } } fn max_batch_size(&self) -> usize { match self { - Batcher::GPU(batcher) => batcher.max_batch_size(), Batcher::CPU(batcher) => batcher.max_batch_size(), + #[cfg(feature = "gpu")] + Batcher::GPU(batcher) => batcher.max_batch_size(), + #[cfg(feature = "opencl")] + Batcher::OpenCL(batcher) => batcher.max_batch_size(), } } } - -// /// NoGPUBatchHasher is a dummy required so we can build with the gpu flag even on platforms on which we cannot currently -// /// run with GPU. -pub struct NoGPUBatchHasher(PhantomData); - -impl BatchHasher for NoGPUBatchHasher -where - A: Arity, -{ - fn hash(&mut self, _preimages: &[GenericArray]) -> Result, Error> { - unimplemented!(); - } - - fn max_batch_size(&self) -> usize { - unimplemented!(); - } -} - -#[cfg(feature = "gpu")] -impl NoGPUBatchHasher -where - A: Arity, -{ - fn futhark_context(&self) -> Arc> { - unimplemented!() - } -} diff --git a/src/column_tree_builder.rs b/src/column_tree_builder.rs index 0097a3c0..fc8bbde7 100644 --- a/src/column_tree_builder.rs +++ b/src/column_tree_builder.rs @@ -120,16 +120,27 @@ where let tree_builder = match { match &column_batcher { + #[cfg(feature = "gpu")] Some(b) => b.futhark_context(), + #[cfg(feature = "opencl")] + Some(b) => b.device(), None => None, } } { + #[cfg(feature = "gpu")] Some(ctx) => TreeBuilder::::new( Some(BatcherType::FromFutharkContext(ctx)), leaf_count, max_tree_batch_size, 0, )?, + #[cfg(feature = "opencl")] + Some(device) => TreeBuilder::::new( + Some(BatcherType::FromDevice(device)), + leaf_count, + max_tree_batch_size, + 0, + )?, None => TreeBuilder::::new(t, leaf_count, max_tree_batch_size, 0)?, }; @@ -162,7 +173,7 
@@ where } } -#[cfg(all(feature = "gpu", not(target_os = "macos")))] +#[cfg(all(any(feature = "gpu", feature = "opencl"), not(target_os = "macos")))] #[cfg(test)] mod tests { use super::*; @@ -181,6 +192,9 @@ mod tests { #[cfg(feature = "gpu")] test_column_tree_builder_aux(Some(BatcherType::GPU), 512, 32, 512, 512); + + #[cfg(feature = "opencl")] + test_column_tree_builder_aux(Some(BatcherType::OpenCL), 512, 32, 512, 512); } fn test_column_tree_builder_aux( diff --git a/src/error.rs b/src/error.rs index 6b54f343..f90233d2 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,7 +1,47 @@ #[cfg(feature = "gpu")] -use crate::cl; +use crate::triton::cl; use std::{error, fmt}; +#[derive(Debug, Clone)] +#[cfg(any(feature = "gpu", feature = "opencl"))] +pub enum ClError { + DeviceNotFound, + PlatformNotFound, + BusIdNotAvailable, + NvidiaBusIdNotAvailable, + AmdTopologyNotAvailable, + PlatformNameNotAvailable, + CannotCreateContext, + CannotCreateQueue, + GetDeviceError, +} + +#[cfg(any(feature = "gpu", feature = "opencl"))] +pub type ClResult = std::result::Result; + +#[cfg(any(feature = "gpu", feature = "opencl"))] +impl fmt::Display for ClError { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + match self { + ClError::DeviceNotFound => write!(f, "Device not found."), + ClError::PlatformNotFound => write!(f, "Platform not found."), + ClError::BusIdNotAvailable => write!(f, "Cannot extract bus-id for the given device."), + ClError::NvidiaBusIdNotAvailable => { + write!(f, "Cannot extract bus-id for the given Nvidia device.") + } + ClError::AmdTopologyNotAvailable => { + write!(f, "Cannot extract bus-id for the given AMD device.") + } + ClError::PlatformNameNotAvailable => { + write!(f, "Cannot extract platform name for the given platform.") + } + ClError::CannotCreateContext => write!(f, "Cannot create cl_context."), + ClError::CannotCreateQueue => write!(f, "Cannot create cl_command_queue."), + ClError::GetDeviceError => write!(f, "Cannot get Device"), 
+ } + } +} + #[derive(Debug, Clone)] /// Possible error states for the hashing. pub enum Error { @@ -11,8 +51,8 @@ pub enum Error { IndexOutOfBounds, /// The provided leaf was not found in the tree GPUError(String), - #[cfg(feature = "gpu")] - ClError(cl::ClError), + #[cfg(any(feature = "gpu", feature = "opencl"))] + ClError(ClError), #[cfg(feature = "gpu")] TritonError(String), DecodingError, @@ -20,8 +60,8 @@ pub enum Error { } #[cfg(feature = "gpu")] -impl From for Error { - fn from(e: cl::ClError) -> Self { +impl From for Error { + fn from(e: ClError) -> Self { Self::ClError(e) } } @@ -44,7 +84,7 @@ impl fmt::Display for Error { ), Error::IndexOutOfBounds => write!(f, "The referenced index is outs of bounds."), Error::GPUError(s) => write!(f, "GPU Error: {}", s), - #[cfg(feature = "gpu")] + #[cfg(any(feature = "gpu", feature = "opencl"))] Error::ClError(e) => write!(f, "OpenCL Error: {}", e), #[cfg(feature = "gpu")] Error::TritonError(e) => write!(f, "Neptune-triton Error: {}", e), diff --git a/src/lib.rs b/src/lib.rs index 2b1d15fd..51d503d2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,9 @@ pub use error::Error; use ff::{Field, PrimeField, ScalarEngine}; use generic_array::GenericArray; +#[cfg(all(feature = "gpu", feature = "opencl"))] +compile_error!("gpu and opencl features are mutually exclusive"); + /// Poseidon circuit pub mod circuit; pub mod error; @@ -27,23 +30,23 @@ mod round_constants; pub mod hash_type; /// Tree Builder -#[cfg(feature = "gpu")] +#[cfg(any(feature = "gpu", feature = "opencl"))] pub mod tree_builder; /// Column Tree Builder -#[cfg(feature = "gpu")] +#[cfg(any(feature = "gpu", feature = "opencl"))] pub mod column_tree_builder; #[cfg(feature = "gpu")] -mod gpu; - -#[cfg(feature = "gpu")] -pub mod cl; +pub mod triton; /// Batch Hasher -#[cfg(feature = "gpu")] +#[cfg(any(feature = "gpu", feature = "opencl"))] pub mod batch_hasher; +#[cfg(feature = "opencl")] +pub mod proteus; + pub(crate) const TEST_SEED: [u8; 16] = [ 0x59, 0x62, 
0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, 0xe5, ]; diff --git a/src/poseidon.rs b/src/poseidon.rs index 79baec6c..0a039162 100644 --- a/src/poseidon.rs +++ b/src/poseidon.rs @@ -452,7 +452,6 @@ where ) { element.add_assign(round_constant); } - self.constants_offset += self.elements.len(); } diff --git a/src/proteus/cl/poseidon.cl b/src/proteus/cl/poseidon.cl new file mode 100644 index 00000000..58356c02 --- /dev/null +++ b/src/proteus/cl/poseidon.cl @@ -0,0 +1,178 @@ +typedef struct state {{ + {field} elements[{width}]; + int current_round; + int rk_offset; +}} state; + +void debug_f({field} f) {{ + {field}_print({field}_unmont(f)); + printf("\n"); +}} +void debug(state s) {{ + if (get_global_id(0) == 0) {{ + printf("state: "); + for (int i = 0; i < {width}; ++i) {{ + {field} x = s.elements[i]; + debug_f(x); + }} + printf("\n"); + }} +}} + +void debug_vec(__constant {field} v[], int size) {{ + if (get_global_id(0) == 0) {{ + for (int i = 0; i < size; ++i) {{ + {field} x = v[i]; + debug_f(x); + }} + printf("\n"); + }} +}} + +{field} quintic_s_box({field} l, {field} pre_add, {field} post_add) {{ + {field} tmp = {field}_add(l, pre_add); + tmp = {field}_sqr(l); + tmp = {field}_sqr(tmp); + tmp = {field}_mul(tmp, l); + tmp = {field}_add(tmp, post_add); + + return tmp; + }} + +state add_round_key(__constant {field} constants[{constants_elements}], state s, int i) {{ + s.elements[i] = {field}_add(s.elements[i], (constants + {round_keys_offset})[s.rk_offset + i]); + return s; +}} + +state apply_matrix (__constant {field} matrix[{width}][{width}], state s) {{ + {field} tmp[{width}]; + for (int i = 0; i < {width}; ++i) {{ + tmp[i] = s.elements[i]; + s.elements[i] = {field}_ZERO; + }} + + int size = {width}*{width}; + for (int j = 0; j < {width}; ++j) {{ + for (int i = 0; i < {width}; ++i) {{ + s.elements[j] = {field}_add(s.elements[j], {field}_mul(matrix[i][j], tmp[i])); + }} + }} + return s; + }} + +{field} scalar_product(__constant 
{field}* a, {field}* b, int size) {{ + {field} res = {field}_ZERO; + + for (int i = 0; i < size; ++i) {{ + {field} tmp = {field}_mul(a[i], b[i]); + res = {field}_add(res, tmp); + }} + + return res; + }} + +state apply_sparse_matrix (__constant {field} sm[{sparse_matrix_size}], state s) {{ + {field} first_elt = s.elements[0]; + + s.elements[0] = scalar_product(sm + {w_hat_offset}, s.elements, {width}); + + for (int i = 1; i < {width}; ++i) {{ + {field} val = {field}_mul((sm + {v_rest_offset})[i-1], first_elt); + s.elements[i] = {field}_add(s.elements[i], val); + }} + + return s; + }} + +state apply_round_matrix (__constant {field} constants[{constants_elements}], state s) {{ + if (s.current_round == {sparse_offset}) {{ + s = apply_matrix(constants + {pre_sparse_matrix_offset}, s); + }} else if ((s.current_round > {sparse_offset}) && (s.current_round < {full_half} + {partial_rounds})) {{ + int index = s.current_round - {sparse_offset} - 1; + s = apply_sparse_matrix(constants + {sparse_matrixes_offset} + (index * {sparse_matrix_size}), s); + }} else {{ + s = apply_matrix(constants + {mds_matrix_offset}, s); + }} + return s; + }} + +state add_full_round_keys (__constant {field} constants[{constants_elements}], state s) {{ + for (int i = 0; i < {width}; ++i) {{ + s = add_round_key(constants, s, i); + }} + s.rk_offset += {width}; + return s; + }} + +state add_partial_round_key (__constant {field} constants[{constants_elements}], state s) {{ + s = add_round_key(constants, s, 0); + s.rk_offset += 1; + return s; +}} + +state full_round (__constant {field} constants[{constants_elements}], state s) {{ + for (int i = 0; i < {width}; ++i) {{ + s.elements[i] = quintic_s_box(s.elements[i], {field}_ZERO, (constants + {round_keys_offset})[s.rk_offset + i]); + }} + s.rk_offset += {width}; + s = apply_round_matrix(constants, s); + s.current_round += 1; + return s; +}} + +state last_full_round (__constant {field} constants[{constants_elements}], state s) {{ + for (int i = 0; i < 
{width}; ++i) {{ + s.elements[i] = quintic_s_box(s.elements[i], {field}_ZERO, {field}_ZERO); + }} + s = apply_round_matrix(constants, s); + return s; +}} + +state partial_round (__constant {field} constants[{constants_elements}], state s) {{ + s.elements[0] = quintic_s_box(s.elements[0], {field}_ZERO, (constants + {round_keys_offset})[s.rk_offset]); + s.rk_offset += 1; + s = apply_round_matrix(constants, s); + s.current_round += 1; + return s; +}} + +state hash (__constant {field} constants[{constants_elements}], state s) {{ + s = add_full_round_keys(constants, s); + + for (int i = 0; i < {full_half}; ++i) {{ + s = full_round(constants, s); + }} + for (int i = 0; i < {partial_rounds}; ++ i) {{ + s = partial_round(constants, s); + }} + for (int i = 0; i < ({full_half} - 1); ++ i) {{ + s = full_round(constants, s); + }} + s = last_full_round(constants, s); + + return s; + }} + +__kernel void hash_preimages(__constant {field} constants[{constants_elements}], + __global {field} *preimages, + __global {field} *digests, + int batch_size + ) {{ + int global_id = get_global_id(0); + + if (global_id < batch_size) {{ + int offset = global_id * {arity}; + + state s; + s.elements[0] = constants[{domain_tag_offset}]; + for (int i = 0; i < {arity}; ++i) {{ + s.elements[i+1] = preimages[offset + i]; + }} + s.current_round = 0; + s.rk_offset = 0; + + s = hash(constants, s); + + digests[global_id] = s.elements[1]; + }} + }} diff --git a/src/proteus/gpu.rs b/src/proteus/gpu.rs new file mode 100644 index 00000000..48c28976 --- /dev/null +++ b/src/proteus/gpu.rs @@ -0,0 +1,318 @@ +use super::sources::generate_program; +use crate::error::{ClError, Error}; +use crate::hash_type::HashType; +use crate::poseidon::PoseidonConstants; +use crate::{Arity, BatchHasher, Strength, DEFAULT_STRENGTH}; +use bellperson::bls::{Bls12, Fr, FrRepr}; +use ff::{Field, PrimeField, PrimeFieldDecodingError}; +use generic_array::{typenum, ArrayLength, GenericArray}; +use log::info; +use 
rust_gpu_tools::opencl::{cl_device_id, Device, GPUSelector}; +use rust_gpu_tools::{call_kernel, opencl}; +use std::collections::HashMap; +use std::marker::PhantomData; +use typenum::{U11, U2, U8}; + +#[derive(Debug)] +struct GPUConstants(PoseidonConstants) +where + A: Arity; + +pub struct CLBatchHasher +where + A: Arity, +{ + device: opencl::Device, + constants: GPUConstants, + constants_buffer: opencl::Buffer, + max_batch_size: usize, + program: opencl::Program, +} + +pub struct DerivedConstants { + pub arity: usize, + pub partial_rounds: usize, + pub width: usize, + pub sparse_matrix_size: usize, + pub full_half: usize, + pub sparse_offset: usize, + pub constants_elements: usize, + + // Offsets + pub domain_tag_offset: usize, + pub round_keys_offset: usize, + pub mds_matrix_offset: usize, + pub pre_sparse_matrix_offset: usize, + pub sparse_matrixes_offset: usize, + pub w_hat_offset: usize, + pub v_rest_offset: usize, +} + +impl GPUConstants +where + A: Arity, +{ + fn derived_constants(&self) -> DerivedConstants { + let c = &self.0; + let arity = c.arity(); + let full_rounds = c.full_rounds; + let partial_rounds = c.partial_rounds; + let sparse_count = partial_rounds; + let width = arity + 1; + let sparse_matrix_size = 2 * width - 1; + let rk_count = width * full_rounds + partial_rounds; + let full_half = full_rounds / 2; + let sparse_offset = full_half - 1; + let constants_elements = + 1 + rk_count + (width * width) + (width * width) + (sparse_count * sparse_matrix_size); + + let matrix_size = width * width; + let mut offset = 0; + let domain_tag_offset = offset; + offset += 1; + let round_keys_offset = offset; + offset += rk_count; + let mds_matrix_offset = offset; + offset += matrix_size; + let pre_sparse_matrix_offset = offset; + offset += matrix_size; + let sparse_matrixes_offset = offset; + + let w_hat_offset = 0; + let v_rest_offset = width; + + DerivedConstants { + arity, + partial_rounds, + width, + sparse_matrix_size, + full_half, + sparse_offset, + 
constants_elements, + domain_tag_offset, + round_keys_offset, + mds_matrix_offset, + pre_sparse_matrix_offset, + sparse_matrixes_offset, + w_hat_offset, + v_rest_offset, + } + } +} + +impl GPUConstants +where + A: Arity, +{ + fn full_rounds(&self) -> usize { + self.0.full_rounds + } + + fn partial_rounds(&self) -> usize { + self.0.partial_rounds + } + + fn to_buffer(&self, program: &opencl::Program) -> Result, Error> { + let DerivedConstants { + arity: _, + partial_rounds: _, + width: _, + sparse_matrix_size: _, + full_half: _, + sparse_offset: _, + constants_elements, + domain_tag_offset, + round_keys_offset, + mds_matrix_offset, + pre_sparse_matrix_offset, + sparse_matrixes_offset, + w_hat_offset: _, + v_rest_offset: _, + } = self.derived_constants(); + + let mut buffer = program + .create_buffer::(constants_elements) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + let c = &self.0; + + buffer + .write_from(domain_tag_offset, &[c.domain_tag]) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + buffer + .write_from(round_keys_offset, &c.compressed_round_constants) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + buffer + .write_from( + mds_matrix_offset, + c.mds_matrices + .m + .iter() + .flatten() + .cloned() + .collect::>() + .as_slice(), + ) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + buffer + .write_from( + pre_sparse_matrix_offset, + c.pre_sparse_matrix + .iter() + .flatten() + .cloned() + .collect::>() + .as_slice(), + ) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + let mut sm_elts = Vec::new(); + for sm in c.sparse_matrixes.iter() { + sm_elts.extend(sm.w_hat.iter()); + sm_elts.extend(sm.v_rest.iter()); + } + buffer + .write_from(sparse_matrixes_offset, &sm_elts) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + Ok(buffer) + } +} + +pub fn get_device(selector: &GPUSelector) -> Result<&'static opencl::Device, Error> { + if let Some(device) = selector.get_device() { + info!("device: {:?}", device); + Ok(device) 
+ } else { + return Err(Error::ClError(ClError::BusIdNotAvailable)); + } +} + +impl CLBatchHasher +where + A: Arity, +{ + /// Create a new `GPUBatchHasher` and initialize it with state corresponding with its `A`. + pub(crate) fn new(selector: &GPUSelector, max_batch_size: usize) -> Result { + let device = get_device(selector)?; + Self::new_with_strength(device, DEFAULT_STRENGTH, max_batch_size) + } + + pub(crate) fn new_with_strength( + device: &opencl::Device, + strength: Strength, + max_batch_size: usize, + ) -> Result { + let constants = GPUConstants(PoseidonConstants::::new_with_strength(strength)); + let src = generate_program::(true, constants.derived_constants()); + let program = opencl::Program::from_opencl(device.clone(), &src) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + let constants_buffer = constants.to_buffer(&program)?; + Ok(Self { + device: device.clone(), + constants, + constants_buffer, + max_batch_size, + program, + }) + } + + pub(crate) fn device(&self) -> opencl::Device { + self.device.clone() + } +} +const LOCAL_WORK_SIZE: usize = 256; +impl BatchHasher for CLBatchHasher +where + A: Arity, +{ + fn hash(&mut self, preimages: &[GenericArray]) -> Result, Error> { + let local_work_size = LOCAL_WORK_SIZE; + let max_batch_size = self.max_batch_size; + let batch_size = preimages.len(); + assert!(batch_size <= max_batch_size); + + // Set `global_work_size` to smallest multiple of `local_work_size` >= `batch-size`. 
+ let global_work_size = ((batch_size / local_work_size) + + (batch_size % local_work_size != 0) as usize) + * local_work_size; + + let num_hashes = preimages.len(); + + let kernel = + self.program + .create_kernel("hash_preimages", global_work_size, Some(local_work_size)); + + let mut preimages_buffer = self + .program + .create_buffer::>(num_hashes) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + preimages_buffer + .write_from(0, preimages) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + let result_buffer = self + .program + .create_buffer::(num_hashes) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + call_kernel!( + kernel, + &self.constants_buffer, + &preimages_buffer, + &result_buffer, + preimages.len() as i32 + ) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + + let mut frs = vec![::zero(); num_hashes]; + result_buffer + .read_into(0, &mut frs) + .map_err(|e| Error::GPUError(format!("{:?}", e)))?; + Ok(frs.to_vec()) + } + + fn max_batch_size(&self) -> usize { + self.max_batch_size + } +} + +#[cfg(test)] +#[cfg(all(feature = "opencl", not(target_os = "macos")))] +mod test { + use super::*; + use crate::poseidon::{Poseidon, SimplePoseidonBatchHasher}; + use generic_array::sequence::GenericSequence; + use rand::SeedableRng; + use rand_xorshift::XorShiftRng; + + #[test] + fn test_batch_hash_2() { + let mut rng = XorShiftRng::from_seed(crate::TEST_SEED); + let device = get_device(&GPUSelector::Index(0)).unwrap(); + + // NOTE: `batch_size` is not a multiple of `LOCAL_WORK_SIZE`. 
+ let batch_size = 1025; + + let mut cl_hasher = + CLBatchHasher::::new_with_strength(device, Strength::Standard, batch_size).unwrap(); + let mut simple_hasher = + SimplePoseidonBatchHasher::::new_with_strength(Strength::Standard, batch_size) + .unwrap(); + + let preimages = (0..batch_size) + .map(|_| GenericArray::::generate(|_| Fr::random(&mut rng))) + .collect::>(); + + let cl_hashes = cl_hasher.hash(&preimages).unwrap(); + let expected_hashes: Vec<_> = simple_hasher.hash(&preimages).unwrap(); + + dbg!( + &cl_hashes, + &expected_hashes, + &cl_hashes.len(), + &expected_hashes.len() + ); + + assert_eq!(expected_hashes, cl_hashes); + } +} diff --git a/src/proteus/mod.rs b/src/proteus/mod.rs new file mode 100644 index 00000000..b2c115b8 --- /dev/null +++ b/src/proteus/mod.rs @@ -0,0 +1,2 @@ +pub mod gpu; +pub mod sources; diff --git a/src/proteus/sources.rs b/src/proteus/sources.rs new file mode 100644 index 00000000..358f97f9 --- /dev/null +++ b/src/proteus/sources.rs @@ -0,0 +1,65 @@ +use super::gpu::DerivedConstants; +use bellperson::bls::{Engine, Fr}; +use ff::PrimeField; +use itertools::join; + +fn config() -> String { + "".to_string() +} + +fn poseidon_source(field: &str, derived_constants: DerivedConstants) -> String { + let DerivedConstants { + arity, + partial_rounds, + width, + sparse_matrix_size, + full_half, + sparse_offset, + constants_elements, + domain_tag_offset, + round_keys_offset, + mds_matrix_offset, + pre_sparse_matrix_offset, + sparse_matrixes_offset, + w_hat_offset, + v_rest_offset, + } = derived_constants; + + format!( + include_str!("cl/poseidon.cl"), + arity = arity, + field = field, + partial_rounds = partial_rounds, + width = width, + sparse_matrix_size = sparse_matrix_size, + full_half = full_half, + sparse_offset = sparse_offset, + constants_elements = constants_elements, + domain_tag_offset = domain_tag_offset, + round_keys_offset = round_keys_offset, + mds_matrix_offset = mds_matrix_offset, + pre_sparse_matrix_offset = 
pre_sparse_matrix_offset,
+        w_hat_offset = w_hat_offset,
+        v_rest_offset = v_rest_offset,
+        sparse_matrixes_offset = sparse_matrixes_offset,
+    )
+}
+
+pub fn generate_program<Fr>(limb64: bool, derived_constants: DerivedConstants) -> String
+where
+    Fr: PrimeField,
+{
+    let field_source = if limb64 {
+        ff_cl_gen::field::<Fr, ff_cl_gen::Limb64>("Fr")
+    } else {
+        ff_cl_gen::field::<Fr, ff_cl_gen::Limb32>("Fr")
+    };
+    join(
+        &[
+            config(),
+            field_source,
+            poseidon_source("Fr", derived_constants),
+        ],
+        "\n",
+    )
+}
diff --git a/src/tree_builder.rs b/src/tree_builder.rs
index 044f3437..18f10c6e 100644
--- a/src/tree_builder.rs
+++ b/src/tree_builder.rs
@@ -247,7 +247,7 @@ where
     }
 }

-#[cfg(all(feature = "gpu", not(target_os = "macos")))]
+#[cfg(all(any(feature = "gpu", feature = "opencl"), not(target_os = "macos")))]
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -263,6 +263,9 @@ mod tests {

         #[cfg(all(feature = "gpu", not(target_os = "macos")))]
         test_tree_builder_aux(Some(BatcherType::GPU), 512, 32, 512, 512);
+
+        #[cfg(all(feature = "opencl", not(target_os = "macos")))]
+        test_tree_builder_aux(Some(BatcherType::OpenCL), 512, 32, 512, 512);
     }

     fn test_tree_builder_aux(
diff --git a/src/cl.rs b/src/triton/cl.rs
similarity index 69%
rename from src/cl.rs
rename to src/triton/cl.rs
index 423b96cb..c5cf187e 100644
--- a/src/cl.rs
+++ b/src/triton/cl.rs
@@ -1,3 +1,4 @@
+use crate::error::{ClError, ClResult};
 use log::*;
 use rust_gpu_tools::opencl::{cl_device_id, Device, GPUSelector};
 use std::collections::HashMap;
@@ -8,57 +9,11 @@ use triton::bindings;
 use triton::FutharkContext;

 const MAX_LEN: usize = 128;

-#[repr(C)]
-#[derive(Debug, Clone, Default)]
-struct cl_amd_device_topology {
-    r#type: u32,
-    unused: [u8; 17],
-    bus: u8,
-    device: u8,
-    function: u8,
-}
-
 lazy_static!
{
    pub static ref FUTHARK_CONTEXT_MAP: RwLock<HashMap<GPUSelector, Arc<Mutex<FutharkContext>>>> =
        RwLock::new(HashMap::new());
}

-#[derive(Debug, Clone)]
-pub enum ClError {
-    DeviceNotFound,
-    PlatformNotFound,
-    BusIdNotAvailable,
-    NvidiaBusIdNotAvailable,
-    AmdTopologyNotAvailable,
-    PlatformNameNotAvailable,
-    CannotCreateContext,
-    CannotCreateQueue,
-    GetDeviceError,
-}
-pub type ClResult<T> = std::result::Result<T, ClError>;
-
-impl fmt::Display for ClError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
-        match self {
-            ClError::DeviceNotFound => write!(f, "Device not found."),
-            ClError::PlatformNotFound => write!(f, "Platform not found."),
-            ClError::BusIdNotAvailable => write!(f, "Cannot extract bus-id for the given device."),
-            ClError::NvidiaBusIdNotAvailable => {
-                write!(f, "Cannot extract bus-id for the given Nvidia device.")
-            }
-            ClError::AmdTopologyNotAvailable => {
-                write!(f, "Cannot extract bus-id for the given AMD device.")
-            }
-            ClError::PlatformNameNotAvailable => {
-                write!(f, "Cannot extract platform name for the given platform.")
-            }
-            ClError::CannotCreateContext => write!(f, "Cannot create cl_context."),
-            ClError::CannotCreateQueue => write!(f, "Cannot create cl_command_queue."),
-            ClError::GetDeviceError => write!(f, "Cannot get Device"),
-        }
-    }
-}
-
 fn create_context(device: bindings::cl_device_id) -> ClResult<bindings::cl_context> {
     let mut res = 0i32;
     let context = unsafe {
diff --git a/src/gpu.rs b/src/triton/gpu.rs
similarity index 99%
rename from src/gpu.rs
rename to src/triton/gpu.rs
index cd4fef29..63aad589 100644
--- a/src/gpu.rs
+++ b/src/triton/gpu.rs
@@ -1,4 +1,4 @@
-use crate::cl;
+use super::cl;
 use crate::error::Error;
 use crate::hash_type::HashType;
 use crate::poseidon::PoseidonConstants;
@@ -630,8 +630,8 @@ fn u64_vec<'a, U: ArrayLength<Fr>>(vec: &'a [GenericArray<Fr, U>]) -> Vec<u64> {
 #[cfg(all(feature = "gpu", not(target_os = "macos")))]
 mod tests {
     use super::*;
-    use crate::gpu::BatcherState;
     use crate::poseidon::{Poseidon, SimplePoseidonBatchHasher};
+    use
crate::triton::gpu::BatcherState; use crate::BatchHasher; use ff::{Field, ScalarEngine}; use generic_array::sequence::GenericSequence; @@ -667,6 +667,7 @@ mod tests { let (hashes, _) = mbatch_hash2(&mut ctx.lock().unwrap(), &mut state, preimages.as_slice()).unwrap(); + let gpu_hashes = gpu_hasher.hash(&preimages).unwrap(); let expected_hashes: Vec<_> = simple_hasher.hash(&preimages).unwrap(); diff --git a/src/triton/mod.rs b/src/triton/mod.rs new file mode 100644 index 00000000..8bd7fdd8 --- /dev/null +++ b/src/triton/mod.rs @@ -0,0 +1,2 @@ +pub mod cl; +pub mod gpu;