From 17364e7dbc5bcad584fcb6a9753991c01681ada7 Mon Sep 17 00:00:00 2001
From: porcuquine <porcuquine@users.noreply.github.com>
Date: Thu, 25 Feb 2021 02:47:30 +0000
Subject: [PATCH] Optimize opencl and make it default gpu feature.

---
 .circleci/config.yml                          | 124 ++++++++++++++----
 fil-proofs-param/Cargo.toml                   |   1 -
 fil-proofs-tooling/Cargo.toml                 |   8 --
 filecoin-hashers/Cargo.toml                   |   3 +-
 filecoin-proofs/Cargo.toml                    |   8 --
 storage-proofs-core/Cargo.toml                |   3 +-
 storage-proofs-porep/Cargo.toml               |   3 +-
 .../src/stacked/vanilla/proof.rs              | 116 ++++++++--------
 storage-proofs-post/Cargo.toml                |   1 -
 9 files changed, 155 insertions(+), 112 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index a0a54da0d0..c3041f5b22 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -11,6 +11,11 @@ executors:
       - image: filecoin/rust:latest
     working_directory: /mnt/crate
     resource_class: 2xlarge+
+  gpu:  # Executor used by the *_gpu jobs and test_gpu_tree_building.
+    machine:
+      image: ubuntu-1604-cuda-10.1:201909-23
+    working_directory: ~/gpuci  # Also the persist_to_workspace root in cargo_fetch_gpu.
+    resource_class: gpu.nvidia.medium
 
 setup-env: &setup-env
   FIL_PROOFS_PARAMETER_CACHE: "/root/filecoin-proof-parameters/"
@@ -27,6 +32,16 @@ jobs:
       - ensure_filecoin_parameters
       - save_parameter_cache
 
+  ensure_groth_parameters_and_keys_linux_gpu:  # Builds paramcache and obtains groth parameters on the gpu executor.
+    executor: gpu
+    environment: *setup-env
+    steps:
+      - checkout
+      - cargo_fetch_gpu  # NOTE(review): declared under `jobs:` in this patch; confirm a command with this name exists.
+      - restore_parameter_cache_gpu
+      - ensure_filecoin_parameters_gpu
+      - save_parameter_cache_gpu  # Pairs with restore_parameter_cache_gpu so the GPU cache key is actually written.
+
   cargo_fetch:
     executor: default
     environment: *setup-env
@@ -57,6 +72,43 @@ jobs:
           paths:
             - /root/.cargo
             - /root/.rustup
+  cargo_fetch_gpu:  # Installs Rust via rustup on the gpu executor, fetches crates, caches ~/.cargo and ~/.rustup.
+    executor: gpu
+    environment: *setup-env
+    steps:
+      - checkout
+      - run: curl https://sh.rustup.rs -sSf | sh -s -- -y
+      - run: echo 'export PATH="$HOME:$HOME/.cargo/bin:$PATH"' >> $BASH_ENV
+      - run: echo $BASH_ENV
+      - run: echo $HOME
+      - run: source $BASH_ENV
+      - run: cargo --version
+      - run: rustc --version
+      - run:
+          name: Calculate dependencies
+          command: cargo generate-lockfile
+          no_output_timeout: 30m
+      - restore_cache:
+          keys:
+            - cargo-v28-gpu-d-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
+      - run: rustup install $(cat rust-toolchain)
+      - run: rustup default $(cat rust-toolchain)
+      - run: rustup install << pipeline.parameters.nightly-toolchain >>
+      - run: rustup component add rustfmt-preview
+      - run: rustup component add clippy
+      - run: cargo update
+      - run: cargo fetch
+      - run: rustc +$(cat rust-toolchain) --version
+      - run: rustup toolchain list --verbose
+      - persist_to_workspace:
+          root: ~/gpuci
+          paths:
+            - Cargo.lock
+      - save_cache:
+          key: cargo-v28-gpu-d-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
+          paths:
+            - "~/.cargo"
+            - "~/.rustup"
   test:
     executor: default
     environment: *setup-env
@@ -152,7 +204,6 @@ jobs:
           environment:
             RUST_TEST_THREADS: 1
             FIL_PROOFS_USE_MULTICORE_SDR: true
-
       - run:
           name: Test with use_multicore_sdr and blst enabled
           command: |
@@ -165,10 +216,8 @@ jobs:
             RUST_TEST_THREADS: 1
             FIL_PROOFS_USE_MULTICORE_SDR: true
 
-  # Running with `use_multicore_sdr=true` should be integrated directly into the test code. For now we
-  # just re-run the lifecycle tests to exercise the use_multicore_sdr code path with that setting set.
-  test_multicore_sdr_gpu2:
-    executor: default
+  test_gpu_tree_building:
+    executor: gpu
     environment: *setup-env
     steps:
       - checkout
@@ -176,33 +225,25 @@ jobs:
           at: "."
       - restore_cache:
           keys:
-            - cargo-v28-b-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
+            - cargo-v28-gpu-d-{{ checksum "rust-toolchain" }}-{{ checksum "Cargo.toml" }}-{{ checksum "Cargo.lock" }}-{{ arch }}
       - restore_parameter_cache
       - run:
-          name: Test with use_multicore_sdr pairing enabled
+          name: Test with GPU column and tree builders.
           command: |
+            sudo apt-get update -y
+            apt-cache search opencl
+            sudo apt install -y ocl-icd-opencl-dev
+            sudo apt install -y hwloc libhwloc-dev
             ulimit -n 20000
             ulimit -u 20000
             ulimit -n 20000
-            cargo +<< pipeline.parameters.nightly-toolchain >> -Zpackage-features test --all --verbose --release lifecycle -- --ignored --nocapture
-            cargo +<< pipeline.parameters.nightly-toolchain >> -Zpackage-features test -p storage-proofs-porep --features single-threaded --release checkout_cores -- --test-threads=1
+            cd filecoin-proofs
+            ~/.cargo/bin/cargo +<< pipeline.parameters.nightly-toolchain >> -Zpackage-features test --all --verbose --release lifecycle -- --ignored --nocapture
           no_output_timeout: 30m
           environment:
             RUST_TEST_THREADS: 1
-            FIL_PROOFS_USE_MULTICORE_SDR: true
-
-      - run:
-          name: Test with use_multicore_sdr and blst enabled
-          command: |
-            ulimit -n 20000
-            ulimit -u 20000
-            ulimit -n 20000
-            cargo +<< pipeline.parameters.nightly-toolchain >> -Zpackage-features test --all --no-default-features --features gpu2,blst --verbose --release  lifecycle -- --ignored --nocapture
-          no_output_timeout: 30m
-          environment:
-            RUST_TEST_THREADS: 1
-            FIL_PROOFS_USE_MULTICORE_SDR: true
-
+            FIL_PROOFS_USE_GPU_COLUMN_BUILDER: true
+            FIL_PROOFS_USE_GPU_TREE_BUILDER: true
 
   test_no_gpu:
     executor: default
@@ -226,7 +267,6 @@ jobs:
             cargo +<< pipeline.parameters.nightly-toolchain >> -Zpackage-features test --all --verbose --no-default-features --features blst
           no_output_timeout: 30m
 
-
   test_blst:
     executor: default
     environment: *setup-env
@@ -442,6 +482,18 @@ commands:
           name: Obtain filecoin groth parameters
           command: ~/paramcache.awesome --sector-sizes='2048,4096,16384,32768'
           no_output_timeout: 60m
+  ensure_filecoin_parameters_gpu:  # Builds paramcache with ~/.cargo/bin/cargo if ~/paramcache.awesome is missing, then runs it.
+    steps:
+      - run:
+          name: Build paramcache if it doesn't already exist
+          command: |
+            set -x; test -f ~/paramcache.awesome \
+            || (~/.cargo/bin/cargo build --release --workspace && find . -type f -name paramcache | xargs -I {} mv {} ~/paramcache.awesome)
+      - run:
+          name: Obtain filecoin groth parameters
+          command: ~/paramcache.awesome --sector-sizes='2048,4096,16384,32768'
+          no_output_timeout: 60m
+
   save_parameter_cache:
     steps:
       - save_cache:
@@ -454,13 +506,29 @@ commands:
       - restore_cache:
          keys:
             - proof-params-v28-b-{{ checksum "filecoin-proofs/parameters.json" }}-{{ arch }}
+  save_parameter_cache_gpu:  # Caches the paramcache executable and parameter directory under the GPU-specific key.
+    steps:
+      - save_cache:
+          key: proof-params-v28-gpu-{{ checksum "filecoin-proofs/parameters.json" }}-{{ arch }}
+          paths:
+            - "~/paramcache.awesome"
+            - "~/filecoin-proof-parameters/"  # NOTE(review): setup-env sets FIL_PROOFS_PARAMETER_CACHE to /root/filecoin-proof-parameters/; confirm ~ is /root on the gpu executor.
+  restore_parameter_cache_gpu:  # Restores the cache stored by save_parameter_cache_gpu (same key template).
+    steps:
+      - restore_cache:
+         keys:
+            - proof-params-v28-gpu-{{ checksum "filecoin-proofs/parameters.json" }}-{{ arch }}
 
 workflows:
   version: 2.1
   test_all:
     jobs:
       - ensure_groth_parameters_and_keys_linux
+      - ensure_groth_parameters_and_keys_linux_gpu:
+          requires:
+            - cargo_fetch_gpu
       - cargo_fetch
+      - cargo_fetch_gpu
       - rustfmt:
           requires:
             - cargo_fetch
@@ -501,12 +569,12 @@ workflows:
             - cargo_fetch
             - ensure_groth_parameters_and_keys_linux
 
-      - test_multicore_sdr:
+      - test_gpu_tree_building:
           requires:
-            - cargo_fetch
-            - ensure_groth_parameters_and_keys_linux
+            - cargo_fetch_gpu
+            - ensure_groth_parameters_and_keys_linux_gpu
 
-      - test_multicore_sdr_gpu2:
+      - test_multicore_sdr:
           requires:
             - cargo_fetch
             - ensure_groth_parameters_and_keys_linux
diff --git a/fil-proofs-param/Cargo.toml b/fil-proofs-param/Cargo.toml
index c95e5b215f..35cbf24f31 100644
--- a/fil-proofs-param/Cargo.toml
+++ b/fil-proofs-param/Cargo.toml
@@ -68,6 +68,5 @@ heap-profile = ["gperftools/heap"]
 simd = ["storage-proofs-core/simd"]
 asm = ["storage-proofs-core/asm"]
 gpu = ["storage-proofs-core/gpu", "storage-proofs-porep/gpu", "storage-proofs-post/gpu", "bellperson/gpu"]
-gpu2 = ["storage-proofs-core/gpu2", "storage-proofs-porep/gpu2", "storage-proofs-post/gpu2", "bellperson/gpu"]
 pairing = ["storage-proofs-core/pairing", "storage-proofs-porep/pairing", "storage-proofs-post/pairing", "bellperson/pairing"]
 blst = ["storage-proofs-core/blst", "storage-proofs-porep/blst", "storage-proofs-post/blst", "bellperson/blst"]
diff --git a/fil-proofs-tooling/Cargo.toml b/fil-proofs-tooling/Cargo.toml
index 2ee49f2f97..4b85ebf363 100644
--- a/fil-proofs-tooling/Cargo.toml
+++ b/fil-proofs-tooling/Cargo.toml
@@ -62,14 +62,6 @@ gpu = [
     "bellperson/gpu",
     "filecoin-hashers/gpu",
 ]
-gpu2 = [
-    "storage-proofs-core/gpu2",
-    "storage-proofs-porep/gpu2",
-    "storage-proofs-post/gpu2",
-    "filecoin-proofs/gpu2",
-    "bellperson/gpu",
-    "filecoin-hashers/gpu2",
-]
 measurements = ["storage-proofs-core/measurements"]
 profile = ["storage-proofs-core/profile", "measurements"]
 pairing = [
diff --git a/filecoin-hashers/Cargo.toml b/filecoin-hashers/Cargo.toml
index b2413e416c..cf8f7d77eb 100644
--- a/filecoin-hashers/Cargo.toml
+++ b/filecoin-hashers/Cargo.toml
@@ -26,8 +26,7 @@ hex = "0.4.2"
 [features]
 default = ["gpu", "pairing", "blake2s", "poseidon", "sha256"]
 
-gpu = ["bellperson/gpu", "neptune/gpu"]
-gpu2 = ["bellperson/gpu", "neptune/opencl"]
+gpu = ["bellperson/gpu", "neptune/opencl"]
 
 pairing = ["bellperson/pairing", "neptune/pairing", "bellperson/pairing-serde"]
 blst = ["bellperson/blst", "neptune/blst", "bellperson/blst-serde"]
diff --git a/filecoin-proofs/Cargo.toml b/filecoin-proofs/Cargo.toml
index 564a458c31..5fcb1a19ae 100644
--- a/filecoin-proofs/Cargo.toml
+++ b/filecoin-proofs/Cargo.toml
@@ -62,14 +62,6 @@ gpu = [
     "filecoin-hashers/gpu",
     "fr32/gpu",
 ]
-gpu2 = [
-    "storage-proofs-core/gpu2",
-    "storage-proofs-porep/gpu2",
-    "storage-proofs-post/gpu2",
-    "bellperson/gpu",
-    "filecoin-hashers/gpu2",
-    "fr32/gpu",
-]
 pairing = [
     "storage-proofs-core/pairing",
     "storage-proofs-porep/pairing",
diff --git a/storage-proofs-core/Cargo.toml b/storage-proofs-core/Cargo.toml
index b59593464c..5d67e741c0 100644
--- a/storage-proofs-core/Cargo.toml
+++ b/storage-proofs-core/Cargo.toml
@@ -63,8 +63,7 @@ big-sector-sizes-bench = []
 measurements = ["cpu-time", "gperftools"]
 profile = ["measurements"]
 
-gpu = ["bellperson/gpu", "neptune/gpu", "filecoin-hashers/gpu", "fr32/gpu"]
-gpu2 = ["bellperson/gpu", "neptune/opencl", "filecoin-hashers/gpu2", "fr32/gpu"]
+gpu = ["bellperson/gpu", "neptune/opencl", "filecoin-hashers/gpu", "fr32/gpu"]
 pairing = ["bellperson/pairing", "neptune/pairing", "bellperson/pairing-serde", "filecoin-hashers/pairing", "fr32/pairing"]
 blst = ["bellperson/blst", "neptune/blst", "bellperson/blst-serde", "filecoin-hashers/blst", "fr32/blst"]
 
diff --git a/storage-proofs-porep/Cargo.toml b/storage-proofs-porep/Cargo.toml
index 0f5f6b67d1..0cb6e2bf9b 100644
--- a/storage-proofs-porep/Cargo.toml
+++ b/storage-proofs-porep/Cargo.toml
@@ -51,8 +51,7 @@ filecoin-hashers = { path = "../filecoin-hashers", version = "1.0.0", default-fe
 
 [features]
 default = ["pairing", "gpu"]
-gpu = ["storage-proofs-core/gpu", "filecoin-hashers/gpu", "neptune/gpu", "bellperson/gpu", "fr32/gpu"]
-gpu2 = ["storage-proofs-core/gpu2", "filecoin-hashers/gpu2", "neptune/opencl", "bellperson/gpu", "fr32/gpu"]
+gpu = ["storage-proofs-core/gpu", "filecoin-hashers/gpu", "neptune/opencl", "bellperson/gpu", "fr32/gpu"]
 pairing = ["storage-proofs-core/pairing", "bellperson/pairing", "neptune/pairing", "filecoin-hashers/pairing", "fr32/pairing"]
 blst = ["storage-proofs-core/blst", "bellperson/blst", "neptune/blst", "filecoin-hashers/blst", "fr32/blst"]
 single-threaded = []
diff --git a/storage-proofs-porep/src/stacked/vanilla/proof.rs b/storage-proofs-porep/src/stacked/vanilla/proof.rs
index 7254467db5..289e13e099 100644
--- a/storage-proofs-porep/src/stacked/vanilla/proof.rs
+++ b/storage-proofs-porep/src/stacked/vanilla/proof.rs
@@ -1,14 +1,12 @@
 use std::fs;
 use std::marker::PhantomData;
 use std::path::{Path, PathBuf};
-use std::sync::Mutex;
 
 use anyhow::Context;
 use bincode::deserialize;
 use fdlimit::raise_fd_limit;
 use filecoin_hashers::{Domain, HashFunction, Hasher, PoseidonArity};
 use generic_array::typenum::{Unsigned, U0, U11, U2, U8};
-use lazy_static::lazy_static;
 use log::{error, info, trace};
 use merkletree::{
     merkle::{get_merkle_tree_len, is_merkle_tree_size_valid},
@@ -52,14 +50,6 @@ use crate::{
 
 pub const TOTAL_PARENTS: usize = 37;
 
-lazy_static! {
-    /// Ensure that only one `TreeBuilder` or `ColumnTreeBuilder` uses the GPU at a time.
-    /// Curently, this is accomplished by only instantiating at most one at a time.
-    /// It might be possible to relax this constraint, but in that case, only one builder
-    /// should actually be active at any given time, so the mutex should still be used.
-    static ref GPU_LOCK: Mutex<()> = Mutex::new(());
-}
-
 #[derive(Debug)]
 pub struct StackedDrg<'a, Tree: MerkleTreeTrait, G: Hasher> {
     _a: PhantomData<&'a Tree>,
@@ -368,14 +358,13 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
         let tree = MerkleTree::from_par_iter_with_config(
             (0..leafs)
                 .into_par_iter()
-                // TODO: proper error handling instead of `unwrap()`
                 .map(|i| get_node::<K>(tree_data, i).expect("get_node failure")),
             config,
         )?;
         Ok(tree)
     }
 
-    #[cfg(any(feature = "gpu", feature = "gpu2"))]
+    #[cfg(feature = "gpu")]
     fn generate_tree_c<ColumnArity, TreeArity>(
         layers: usize,
         nodes_count: usize,
@@ -406,7 +395,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
         }
     }
 
-    #[cfg(not(any(feature = "gpu", feature = "gpu2")))]
+    #[cfg(not(feature = "gpu"))]
     fn generate_tree_c<ColumnArity, TreeArity>(
         layers: usize,
         nodes_count: usize,
@@ -428,7 +417,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
     }
 
     #[allow(clippy::needless_range_loop)]
-    #[cfg(any(feature = "gpu", feature = "gpu2"))]
+    #[cfg(feature = "gpu")]
     fn generate_tree_c_gpu<ColumnArity, TreeArity>(
         layers: usize,
         nodes_count: usize,
@@ -441,13 +430,11 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
         TreeArity: PoseidonArity,
     {
         use std::cmp::min;
-        use std::ops::Range;
-        use std::sync::{mpsc::sync_channel, Arc, RwLock};
+        use std::sync::{mpsc::channel, Arc, RwLock};
 
         use bellperson::bls::Fr;
-        use ff::Field;
-        use fr32::fr_into_bytes;
-        use generic_array::{sequence::GenericSequence, GenericArray};
+        use fr32::{bytes_into_fr, fr_into_bytes};
+        use generic_array::GenericArray;
         use merkletree::store::DiskStore;
         use neptune::{
             batch_hasher::BatcherType,
@@ -473,12 +460,12 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
             let column_write_batch_size = SETTINGS.column_write_batch_size as usize;
 
             // This channel will receive batches of columns and add them to the ColumnTreeBuilder.
-            let (builder_tx, builder_rx) = sync_channel(0);
+            let (builder_tx, builder_rx) = channel();
 
             let config_count = configs.len(); // Don't move config into closure below.
             rayon::scope(|s| {
                 // This channel will receive the finished tree data to be written to disk.
-                let (writer_tx, writer_rx) = sync_channel::<(Vec<Fr>, Vec<Fr>)>(0);
+                let (writer_tx, writer_rx) = channel::<(Vec<Fr>, Vec<Fr>)>();
 
                 s.spawn(move |_| {
                     for i in 0..config_count {
@@ -493,14 +480,13 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
                                 tree_count,
                                 chunked_nodes_count,
                             );
-                            let mut columns: Vec<GenericArray<Fr, ColumnArity>> = vec![
-                                GenericArray::<Fr, ColumnArity>::generate(|_i: usize| Fr::zero());
-                                chunked_nodes_count
-                            ];
 
                             // Allocate layer data array and insert a placeholder for each layer.
-                            let mut layer_data: Vec<Vec<Fr>> =
-                                vec![Vec::with_capacity(chunked_nodes_count); layers];
+                            let mut layer_data: Vec<Vec<u8>> =
+                                vec![
+                                    vec![0u8; chunked_nodes_count * std::mem::size_of::<Fr>()];
+                                    layers
+                                ];
 
                             rayon::scope(|s| {
                                 // capture a shadowed version of layer_data.
@@ -508,26 +494,35 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
 
                                 // gather all layer data in parallel.
                                 s.spawn(move |_| {
-                                    for (layer_index, layer_elements) in
+                                    for (layer_index, mut layer_bytes) in
                                         layer_data.iter_mut().enumerate()
                                     {
                                         let store = labels.labels_for_layer(layer_index + 1);
                                         let start = (i * nodes_count) + node_index;
                                         let end = start + chunked_nodes_count;
-                                        let elements: Vec<<Tree::Hasher as Hasher>::Domain> = store
-                                            .read_range(Range { start, end })
+
+                                        store
+                                            .read_range_into(start, end, &mut layer_bytes)
                                             .expect("failed to read store range");
-                                        layer_elements.extend(elements.into_iter().map(Into::into));
                                     }
                                 });
                             });
 
-                            // Copy out all layer data arranged into columns.
-                            for layer_index in 0..layers {
-                                for index in 0..chunked_nodes_count {
-                                    columns[index][layer_index] = layer_data[layer_index][index];
-                                }
-                            }
+                            let columns: Vec<GenericArray<Fr, ColumnArity>> =
+                                (0..chunked_nodes_count)
+                                    .into_par_iter()
+                                    .map(|index| {
+                                        (0..layers)
+                                            .map(|layer_index| {
+                                                bytes_into_fr(
+                                                &layer_data[layer_index][std::mem::size_of::<Fr>()
+                                                    * index
+                                                    ..std::mem::size_of::<Fr>() * (index + 1)],
+                                            ).expect("Could not create Fr from bytes.")
+                                            })
+                                            .collect::<GenericArray<Fr, ColumnArity>>()
+                                    })
+                                    .collect();
 
                             drop(layer_data);
 
@@ -547,11 +542,8 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
                     }
                 });
                 s.spawn(move |_| {
-                    let _gpu_lock = GPU_LOCK.lock().unwrap();
                     let mut column_tree_builder = ColumnTreeBuilder::<ColumnArity, TreeArity>::new(
                         #[cfg(feature = "gpu")]
-                        Some(BatcherType::GPU),
-                        #[cfg(feature = "gpu2")]
                         Some(BatcherType::OpenCL),
                         nodes_count,
                         max_gpu_column_batch_size,
@@ -734,7 +726,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
         })
     }
 
-    #[cfg(any(feature = "gpu", feature = "gpu2"))]
+    #[cfg(feature = "gpu")]
     fn generate_tree_r_last<TreeArity>(
         data: &mut Data<'_>,
         nodes_count: usize,
@@ -767,7 +759,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
         }
     }
 
-    #[cfg(not(any(feature = "gpu", feature = "gpu2")))]
+    #[cfg(not(feature = "gpu"))]
     fn generate_tree_r_last<TreeArity>(
         data: &mut Data<'_>,
         nodes_count: usize,
@@ -789,7 +781,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
         )
     }
 
-    #[cfg(any(feature = "gpu", feature = "gpu2"))]
+    #[cfg(feature = "gpu")]
     fn generate_tree_r_last_gpu<TreeArity>(
         data: &mut Data<'_>,
         nodes_count: usize,
@@ -804,10 +796,10 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
         use std::cmp::min;
         use std::fs::OpenOptions;
         use std::io::Write;
-        use std::sync::mpsc::sync_channel;
+        use std::sync::mpsc::channel;
 
         use bellperson::bls::Fr;
-        use fr32::fr_into_bytes;
+        use fr32::{bytes_into_fr, fr_into_bytes};
         use merkletree::merkle::{get_merkle_tree_cache_size, get_merkle_tree_leafs};
         use neptune::{
             batch_hasher::BatcherType,
@@ -828,13 +820,13 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
         let max_gpu_tree_batch_size = SETTINGS.max_gpu_tree_batch_size as usize;
 
         // This channel will receive batches of leaf nodes and add them to the TreeBuilder.
-        let (builder_tx, builder_rx) = sync_channel::<(Vec<Fr>, bool)>(0);
+        let (builder_tx, builder_rx) = channel::<(Vec<Fr>, bool)>();
         let config_count = configs.len(); // Don't move config into closure below.
         let configs = &configs;
         let tree_r_last_config = &tree_r_last_config;
         rayon::scope(|s| {
             // This channel will receive the finished tree data to be written to disk.
-            let (writer_tx, writer_rx) = sync_channel::<Vec<Fr>>(0);
+            let (writer_tx, writer_rx) = channel::<Vec<Fr>>();
 
             s.spawn(move |_| {
                 for i in 0..config_count {
@@ -855,10 +847,17 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
                             end,
                         );
 
-                        let encoded_data = last_layer_labels
-                            .read_range(start..end)
-                            .expect("failed to read layer range")
+                        let mut layer_bytes = vec![0u8; (end - start) * std::mem::size_of::<Fr>()];
+                        last_layer_labels
+                            .read_range_into(start, end, &mut layer_bytes)
+                            .expect("failed to read layer bytes");
+
+                        let encoded_data = layer_bytes
                             .into_par_iter()
+                            .chunks(std::mem::size_of::<Fr>())
+                            .map(|chunk| {
+                                bytes_into_fr(&chunk).expect("Could not create Fr from bytes.")
+                            })
                             .zip(
                                 data.as_mut()[(start * NODE_SIZE)..(end * NODE_SIZE)]
                                     .par_chunks_mut(NODE_SIZE),
@@ -868,8 +867,11 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
                                     data_node_bytes,
                                 )
                                 .expect("try_from_bytes failed");
-                                let encoded_node =
-                                    encode::<<Tree::Hasher as Hasher>::Domain>(key, data_node);
+
+                                let encoded_node = encode::<<Tree::Hasher as Hasher>::Domain>(
+                                    key.into(),
+                                    data_node,
+                                );
                                 data_node_bytes
                                     .copy_from_slice(AsRef::<[u8]>::as_ref(&encoded_node));
 
@@ -895,11 +897,8 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
                 }
             });
             s.spawn(move |_| {
-                let _gpu_lock = GPU_LOCK.lock().unwrap();
                 let mut tree_builder = TreeBuilder::<Tree::Arity>::new(
                     #[cfg(feature = "gpu")]
-                    Some(BatcherType::GPU),
-                    #[cfg(feature = "gpu2")]
                     Some(BatcherType::OpenCL),
                     nodes_count,
                     max_gpu_tree_batch_size,
@@ -1312,7 +1311,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
     // Assumes data is all zeros.
     // Replica path is used to create configs, but is not read.
     // Instead new zeros are provided (hence the need for replica to be all zeros).
-    #[cfg(any(feature = "gpu", feature = "gpu2"))]
+    #[cfg(feature = "gpu")]
     fn generate_fake_tree_r_last<TreeArity>(
         nodes_count: usize,
         tree_count: usize,
@@ -1345,11 +1344,8 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
             info!("generating tree r last using the GPU");
             let max_gpu_tree_batch_size = SETTINGS.max_gpu_tree_batch_size as usize;
 
-            let _gpu_lock = GPU_LOCK.lock().unwrap();
             let mut tree_builder = TreeBuilder::<Tree::Arity>::new(
                 #[cfg(feature = "gpu")]
-                Some(BatcherType::GPU),
-                #[cfg(feature = "gpu2")]
                 Some(BatcherType::OpenCL),
                 nodes_count,
                 max_gpu_tree_batch_size,
@@ -1445,7 +1441,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
     // Assumes data is all zeros.
     // Replica path is used to create configs, but is not read.
     // Instead new zeros are provided (hence the need for replica to be all zeros).
-    #[cfg(not(any(feature = "gpu", feature = "gpu2")))]
+    #[cfg(not(feature = "gpu"))]
     fn generate_fake_tree_r_last<TreeArity>(
         nodes_count: usize,
         tree_count: usize,
diff --git a/storage-proofs-post/Cargo.toml b/storage-proofs-post/Cargo.toml
index 29d9729a57..b88034ac8c 100644
--- a/storage-proofs-post/Cargo.toml
+++ b/storage-proofs-post/Cargo.toml
@@ -39,6 +39,5 @@ filecoin-hashers = { path = "../filecoin-hashers", version = "1.0.0", default-fe
 [features]
 default = ["pairing", "gpu"]
 gpu = ["storage-proofs-core/gpu", "filecoin-hashers/gpu", "fr32/gpu"]
-gpu2 = ["storage-proofs-core/gpu2", "filecoin-hashers/gpu2", "fr32/gpu"]
 pairing = ["storage-proofs-core/pairing", "bellperson/pairing", "neptune/pairing", "filecoin-hashers/pairing", "fr32/pairing"]
 blst = ["storage-proofs-core/blst", "bellperson/blst", "neptune/blst", "filecoin-hashers/blst", "fr32/blst"]