Skip to content

Commit

Permalink
Optimize opencl and make it default gpu feature.
Browse files Browse the repository at this point in the history
  • Loading branch information
porcuquine committed Feb 25, 2021
1 parent 64390b6 commit 3faf8ec
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 72 deletions.
1 change: 0 additions & 1 deletion fil-proofs-param/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,5 @@ heap-profile = ["gperftools/heap"]
simd = ["storage-proofs-core/simd"]
asm = ["storage-proofs-core/asm"]
gpu = ["storage-proofs-core/gpu", "storage-proofs-porep/gpu", "storage-proofs-post/gpu", "bellperson/gpu"]
gpu2 = ["storage-proofs-core/gpu2", "storage-proofs-porep/gpu2", "storage-proofs-post/gpu2", "bellperson/gpu"]
pairing = ["storage-proofs-core/pairing", "storage-proofs-porep/pairing", "storage-proofs-post/pairing", "bellperson/pairing"]
blst = ["storage-proofs-core/blst", "storage-proofs-porep/blst", "storage-proofs-post/blst", "bellperson/blst"]
8 changes: 0 additions & 8 deletions fil-proofs-tooling/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,6 @@ gpu = [
"bellperson/gpu",
"filecoin-hashers/gpu",
]
gpu2 = [
"storage-proofs-core/gpu2",
"storage-proofs-porep/gpu2",
"storage-proofs-post/gpu2",
"filecoin-proofs/gpu2",
"bellperson/gpu",
"filecoin-hashers/gpu2",
]
measurements = ["storage-proofs-core/measurements"]
profile = ["storage-proofs-core/profile", "measurements"]
pairing = [
Expand Down
3 changes: 1 addition & 2 deletions filecoin-hashers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ hex = "0.4.2"
[features]
default = ["gpu", "pairing", "blake2s", "poseidon", "sha256"]

gpu = ["bellperson/gpu", "neptune/gpu"]
gpu2 = ["bellperson/gpu", "neptune/opencl"]
gpu = ["bellperson/gpu", "neptune/opencl"]

pairing = ["bellperson/pairing", "neptune/pairing", "bellperson/pairing-serde"]
blst = ["bellperson/blst", "neptune/blst", "bellperson/blst-serde"]
Expand Down
8 changes: 0 additions & 8 deletions filecoin-proofs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,6 @@ gpu = [
"filecoin-hashers/gpu",
"fr32/gpu",
]
gpu2 = [
"storage-proofs-core/gpu2",
"storage-proofs-porep/gpu2",
"storage-proofs-post/gpu2",
"bellperson/gpu",
"filecoin-hashers/gpu2",
"fr32/gpu",
]
pairing = [
"storage-proofs-core/pairing",
"storage-proofs-porep/pairing",
Expand Down
3 changes: 1 addition & 2 deletions storage-proofs-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ big-sector-sizes-bench = []
measurements = ["cpu-time", "gperftools"]
profile = ["measurements"]

gpu = ["bellperson/gpu", "neptune/gpu", "filecoin-hashers/gpu", "fr32/gpu"]
gpu2 = ["bellperson/gpu", "neptune/opencl", "filecoin-hashers/gpu2", "fr32/gpu"]
gpu = ["bellperson/gpu", "neptune/opencl", "filecoin-hashers/gpu", "fr32/gpu"]
pairing = ["bellperson/pairing", "neptune/pairing", "bellperson/pairing-serde", "filecoin-hashers/pairing", "fr32/pairing"]
blst = ["bellperson/blst", "neptune/blst", "bellperson/blst-serde", "filecoin-hashers/blst", "fr32/blst"]

Expand Down
3 changes: 1 addition & 2 deletions storage-proofs-porep/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ filecoin-hashers = { path = "../filecoin-hashers", version = "1.0.0", default-fe

[features]
default = ["pairing", "gpu"]
gpu = ["storage-proofs-core/gpu", "filecoin-hashers/gpu", "neptune/gpu", "bellperson/gpu", "fr32/gpu"]
gpu2 = ["storage-proofs-core/gpu2", "filecoin-hashers/gpu2", "neptune/opencl", "bellperson/gpu", "fr32/gpu"]
gpu = ["storage-proofs-core/gpu", "filecoin-hashers/gpu", "neptune/opencl", "bellperson/gpu", "fr32/gpu"]
pairing = ["storage-proofs-core/pairing", "bellperson/pairing", "neptune/pairing", "filecoin-hashers/pairing", "fr32/pairing"]
blst = ["storage-proofs-core/blst", "bellperson/blst", "neptune/blst", "filecoin-hashers/blst", "fr32/blst"]
single-threaded = []
Expand Down
106 changes: 58 additions & 48 deletions storage-proofs-porep/src/stacked/vanilla/proof.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
use std::fs;
use std::marker::PhantomData;
use std::path::{Path, PathBuf};
use std::sync::Mutex;

use anyhow::Context;
use bincode::deserialize;
use fdlimit::raise_fd_limit;
use filecoin_hashers::{Domain, HashFunction, Hasher, PoseidonArity};
use generic_array::typenum::{Unsigned, U0, U11, U2, U8};
use lazy_static::lazy_static;
use log::{error, info, trace};
use merkletree::{
merkle::{get_merkle_tree_len, is_merkle_tree_size_valid},
Expand Down Expand Up @@ -52,14 +50,6 @@ use crate::{

pub const TOTAL_PARENTS: usize = 37;

lazy_static! {
/// Ensure that only one `TreeBuilder` or `ColumnTreeBuilder` uses the GPU at a time.
/// Currently, this is accomplished by only instantiating at most one at a time.
/// It might be possible to relax this constraint, but in that case, only one builder
/// should actually be active at any given time, so the mutex should still be used.
static ref GPU_LOCK: Mutex<()> = Mutex::new(());
}

#[derive(Debug)]
pub struct StackedDrg<'a, Tree: MerkleTreeTrait, G: Hasher> {
_a: PhantomData<&'a Tree>,
Expand Down Expand Up @@ -368,14 +358,13 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
let tree = MerkleTree::from_par_iter_with_config(
(0..leafs)
.into_par_iter()
// TODO: proper error handling instead of `unwrap()`
.map(|i| get_node::<K>(tree_data, i).expect("get_node failure")),
config,
)?;
Ok(tree)
}

#[cfg(any(feature = "gpu", feature = "gpu2"))]
#[cfg(feature = "gpu")]
fn generate_tree_c<ColumnArity, TreeArity>(
layers: usize,
nodes_count: usize,
Expand Down Expand Up @@ -406,7 +395,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
}
}

#[cfg(not(any(feature = "gpu", feature = "gpu2")))]
#[cfg(not(feature = "gpu"))]
fn generate_tree_c<ColumnArity, TreeArity>(
layers: usize,
nodes_count: usize,
Expand All @@ -428,7 +417,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
}

#[allow(clippy::needless_range_loop)]
#[cfg(any(feature = "gpu", feature = "gpu2"))]
#[cfg(feature = "gpu")]
fn generate_tree_c_gpu<ColumnArity, TreeArity>(
layers: usize,
nodes_count: usize,
Expand All @@ -441,8 +430,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
TreeArity: PoseidonArity,
{
use std::cmp::min;
use std::ops::Range;
use std::sync::{mpsc::sync_channel, Arc, RwLock};
use std::sync::{mpsc::channel, Arc, RwLock};

use bellperson::bls::Fr;
use ff::Field;
Expand Down Expand Up @@ -473,12 +461,12 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
let column_write_batch_size = SETTINGS.column_write_batch_size as usize;

// This channel will receive batches of columns and add them to the ColumnTreeBuilder.
let (builder_tx, builder_rx) = sync_channel(0);
let (builder_tx, builder_rx) = channel();

let config_count = configs.len(); // Don't move config into closure below.
rayon::scope(|s| {
// This channel will receive the finished tree data to be written to disk.
let (writer_tx, writer_rx) = sync_channel::<(Vec<Fr>, Vec<Fr>)>(0);
let (writer_tx, writer_rx) = channel::<(Vec<Fr>, Vec<Fr>)>();

s.spawn(move |_| {
for i in 0..config_count {
Expand All @@ -499,33 +487,47 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
];

// Allocate layer data array and insert a placeholder for each layer.
let mut layer_data: Vec<Vec<Fr>> =
vec![Vec::with_capacity(chunked_nodes_count); layers];
let mut layer_data: Vec<Vec<u8>> =
vec![
vec![0u8; chunked_nodes_count * std::mem::size_of::<Fr>()];
layers
];

rayon::scope(|s| {
// capture a shadowed version of layer_data.
let layer_data: &mut Vec<_> = &mut layer_data;

// gather all layer data in parallel.
s.spawn(move |_| {
for (layer_index, layer_elements) in
for (layer_index, mut layer_bytes) in
layer_data.iter_mut().enumerate()
{
let store = labels.labels_for_layer(layer_index + 1);
let start = (i * nodes_count) + node_index;
let end = start + chunked_nodes_count;
let elements: Vec<<Tree::Hasher as Hasher>::Domain> = store
.read_range(Range { start, end })

store
.read_range_into(start, end, &mut layer_bytes)
.expect("failed to read store range");
layer_elements.extend(elements.into_iter().map(Into::into));
}
});
});

// Copy out all layer data arranged into columns.
let mut buf = [0u8; std::mem::size_of::<Fr>()];
for layer_index in 0..layers {
for index in 0..chunked_nodes_count {
columns[index][layer_index] = layer_data[layer_index][index];
buf.copy_from_slice(
&layer_data[layer_index][std::mem::size_of::<Fr>() * index
..std::mem::size_of::<Fr>() * (index + 1)],
);
let fr = unsafe {
// SAFETY: We know the underlying elements of the layers in `LabelsCache`
// were stored on disk with the same memory layout as `Fr`.
std::mem::transmute::<[u8; std::mem::size_of::<Fr>()], Fr>(
buf,
)
};
columns[index][layer_index] = fr;
}
}

Expand All @@ -547,11 +549,8 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
}
});
s.spawn(move |_| {
let _gpu_lock = GPU_LOCK.lock().unwrap();
let mut column_tree_builder = ColumnTreeBuilder::<ColumnArity, TreeArity>::new(
#[cfg(feature = "gpu")]
Some(BatcherType::GPU),
#[cfg(feature = "gpu2")]
Some(BatcherType::OpenCL),
nodes_count,
max_gpu_column_batch_size,
Expand Down Expand Up @@ -734,7 +733,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
})
}

#[cfg(any(feature = "gpu", feature = "gpu2"))]
#[cfg(feature = "gpu")]
fn generate_tree_r_last<TreeArity>(
data: &mut Data<'_>,
nodes_count: usize,
Expand Down Expand Up @@ -767,7 +766,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
}
}

#[cfg(not(any(feature = "gpu", feature = "gpu2")))]
#[cfg(not(feature = "gpu"))]
fn generate_tree_r_last<TreeArity>(
data: &mut Data<'_>,
nodes_count: usize,
Expand All @@ -789,7 +788,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
)
}

#[cfg(any(feature = "gpu", feature = "gpu2"))]
#[cfg(feature = "gpu")]
fn generate_tree_r_last_gpu<TreeArity>(
data: &mut Data<'_>,
nodes_count: usize,
Expand All @@ -804,7 +803,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
use std::cmp::min;
use std::fs::OpenOptions;
use std::io::Write;
use std::sync::mpsc::sync_channel;
use std::sync::mpsc::channel;

use bellperson::bls::Fr;
use fr32::fr_into_bytes;
Expand All @@ -828,13 +827,13 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
let max_gpu_tree_batch_size = SETTINGS.max_gpu_tree_batch_size as usize;

// This channel will receive batches of leaf nodes and add them to the TreeBuilder.
let (builder_tx, builder_rx) = sync_channel::<(Vec<Fr>, bool)>(0);
let (builder_tx, builder_rx) = channel::<(Vec<Fr>, bool)>();
let config_count = configs.len(); // Don't move config into closure below.
let configs = &configs;
let tree_r_last_config = &tree_r_last_config;
rayon::scope(|s| {
// This channel will receive the finished tree data to be written to disk.
let (writer_tx, writer_rx) = sync_channel::<Vec<Fr>>(0);
let (writer_tx, writer_rx) = channel::<Vec<Fr>>();

s.spawn(move |_| {
for i in 0..config_count {
Expand All @@ -855,10 +854,24 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
end,
);

let encoded_data = last_layer_labels
.read_range(start..end)
.expect("failed to read layer range")
let mut layer_bytes = vec![0u8; (end - start) * std::mem::size_of::<Fr>()];
last_layer_labels
.read_range_into(start, end, &mut layer_bytes)
.expect("failed to read layer bytes");

let encoded_data = layer_bytes
.into_par_iter()
.chunks(std::mem::size_of::<Fr>())
.map(|chunk| {
let mut buf = [0u8; std::mem::size_of::<Fr>()];
buf.copy_from_slice(&chunk);

unsafe {
// SAFETY: We know the underlying elements of the layer in `LabelsCache`
// were stored on disk with the same memory layout as `Fr`.
std::mem::transmute::<[u8; std::mem::size_of::<Fr>()], Fr>(buf)
}
})
.zip(
data.as_mut()[(start * NODE_SIZE)..(end * NODE_SIZE)]
.par_chunks_mut(NODE_SIZE),
Expand All @@ -868,8 +881,11 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
data_node_bytes,
)
.expect("try_from_bytes failed");
let encoded_node =
encode::<<Tree::Hasher as Hasher>::Domain>(key, data_node);

let encoded_node = encode::<<Tree::Hasher as Hasher>::Domain>(
key.into(),
data_node,
);
data_node_bytes
.copy_from_slice(AsRef::<[u8]>::as_ref(&encoded_node));

Expand All @@ -895,11 +911,8 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
}
});
s.spawn(move |_| {
let _gpu_lock = GPU_LOCK.lock().unwrap();
let mut tree_builder = TreeBuilder::<Tree::Arity>::new(
#[cfg(feature = "gpu")]
Some(BatcherType::GPU),
#[cfg(feature = "gpu2")]
Some(BatcherType::OpenCL),
nodes_count,
max_gpu_tree_batch_size,
Expand Down Expand Up @@ -1312,7 +1325,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
// Assumes data is all zeros.
// Replica path is used to create configs, but is not read.
// Instead new zeros are provided (hence the need for replica to be all zeros).
#[cfg(any(feature = "gpu", feature = "gpu2"))]
#[cfg(feature = "gpu")]
fn generate_fake_tree_r_last<TreeArity>(
nodes_count: usize,
tree_count: usize,
Expand Down Expand Up @@ -1345,11 +1358,8 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
info!("generating tree r last using the GPU");
let max_gpu_tree_batch_size = SETTINGS.max_gpu_tree_batch_size as usize;

let _gpu_lock = GPU_LOCK.lock().unwrap();
let mut tree_builder = TreeBuilder::<Tree::Arity>::new(
#[cfg(feature = "gpu")]
Some(BatcherType::GPU),
#[cfg(feature = "gpu2")]
Some(BatcherType::OpenCL),
nodes_count,
max_gpu_tree_batch_size,
Expand Down Expand Up @@ -1445,7 +1455,7 @@ impl<'a, Tree: 'static + MerkleTreeTrait, G: 'static + Hasher> StackedDrg<'a, Tr
// Assumes data is all zeros.
// Replica path is used to create configs, but is not read.
// Instead new zeros are provided (hence the need for replica to be all zeros).
#[cfg(not(any(feature = "gpu", feature = "gpu2")))]
#[cfg(not(feature = "gpu"))]
fn generate_fake_tree_r_last<TreeArity>(
nodes_count: usize,
tree_count: usize,
Expand Down
1 change: 0 additions & 1 deletion storage-proofs-post/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,5 @@ filecoin-hashers = { path = "../filecoin-hashers", version = "1.0.0", default-fe
[features]
default = ["pairing", "gpu"]
gpu = ["storage-proofs-core/gpu", "filecoin-hashers/gpu", "fr32/gpu"]
gpu2 = ["storage-proofs-core/gpu2", "filecoin-hashers/gpu2", "fr32/gpu"]
pairing = ["storage-proofs-core/pairing", "bellperson/pairing", "neptune/pairing", "filecoin-hashers/pairing", "fr32/pairing"]
blst = ["storage-proofs-core/blst", "bellperson/blst", "neptune/blst", "filecoin-hashers/blst", "fr32/blst"]

0 comments on commit 3faf8ec

Please sign in to comment.