refactor(bb): IPA parallelization cleanup (#8088)
Small cleanup to IPA and protogalaxy parallelization

- introduce a helper for accumulating variables in loops in a
thread-safe manner, and replace the error-prone and inefficient mutexes
that had been introduced in IPA (see the sketch below)
- get rid of some << 1 and >> 1; there's no reason to use them unless we're
treating numbers as bit-sets, and the compiler is smart enough to optimize * 2 and / 2
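
The shape of the change: rather than guarding one shared accumulator with a mutex, each thread/chunk gets its own accumulator slot and the partial results are summed once at the end. A minimal standalone sketch of that pattern (plain std::thread and illustrative names, not the barretenberg API):

```cpp
#include <cstddef>
#include <numeric>
#include <thread>
#include <vector>

// Sum f(0) + f(1) + ... + f(n - 1) without a mutex: each thread owns one
// accumulator slot, and the partial results are combined once at the end.
template <typename Func> long parallel_sum(std::size_t n, std::size_t num_threads, const Func& f)
{
    std::vector<long> accumulators(num_threads, 0); // one accumulator per thread, nothing shared
    std::vector<std::thread> threads;
    for (std::size_t t = 0; t < num_threads; t++) {
        threads.emplace_back([&, t] {
            for (std::size_t i = t; i < n; i += num_threads) {
                accumulators[t] += f(i); // only this thread touches slot t
            }
        });
    }
    for (auto& th : threads) {
        th.join();
    }
    // Single-threaded combine, analogous to sum()/sum_pairs() in container.hpp below.
    return std::accumulate(accumulators.begin(), accumulators.end(), 0L);
}

int main()
{
    // 0 + 1 + ... + 999 = 499500
    return parallel_sum(1000, 4, [](std::size_t i) { return static_cast<long>(i); }) == 499500 ? 0 : 1;
}
```

Dropping the mutex removes both the locking overhead and the risk of a call site forgetting to take the lock.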
ludamad authored Aug 20, 2024
1 parent a09627c commit 9227fa9
Showing 5 changed files with 105 additions and 75 deletions.
56 changes: 18 additions & 38 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -2,12 +2,14 @@
#include "barretenberg/commitment_schemes/claim.hpp"
#include "barretenberg/commitment_schemes/verification_key.hpp"
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/container.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
#include "barretenberg/transcript/transcript.hpp"
#include <cstddef>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

namespace bb {
@@ -168,10 +170,8 @@ template <typename Curve_> class IPA {
// G_vec_local should use only the original SRS thus we extract only the even indices.
parallel_for_heuristic(
poly_length,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
[&](size_t i) {
G_vec_local[i] = srs_elements[i * 2];
}, thread_heuristics::FF_COPY_COST);

// Step 5.
@@ -196,38 +196,22 @@ template <typename Curve_> class IPA {
GroupElement R_i;
std::size_t round_size = poly_length;

#ifndef NO_MULTITHREADING
// The inner products we'll be computing in parallel need a mutex to be thread-safe during the last
// accumulation
std::mutex inner_product_accumulation_mutex;
#endif
// Step 6.
// Perform IPA reduction rounds
for (size_t i = 0; i < log_poly_degree; i++) {
round_size >>= 1;
// Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
round_size /= 2;
// Run scalar products in parallel
parallel_for_heuristic(
auto inner_prods = parallel_for_heuristic(
round_size,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
// Update the accumulated results thread-safely
{
#ifndef NO_MULTITHREADING
std::unique_lock<std::mutex> lock(inner_product_accumulation_mutex);
#endif
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
}
std::pair{Fr::zero(), Fr::zero()},
[&](size_t j, std::pair<Fr, Fr>& inner_prod_left_right) {
// Compute inner_prod_L := < a_vec_lo, b_vec_hi >
inner_prod_left_right.first += a_vec[j] * b_vec[round_size + j];
// Compute inner_prod_R := < a_vec_hi, b_vec_lo >
inner_prod_left_right.second += a_vec[round_size + j] * b_vec[j];
}, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);

// Sum inner product contributions computed in parallel and unpack the std::pair
auto [inner_prod_L, inner_prod_R] = sum_pairs(inner_prods);
// Step 6.a (using letters, because doxygen automatically converts the sublist counters to letters :( )
// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
L_i = bb::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
@@ -376,23 +360,21 @@ template <typename Curve_> class IPA {

// Step 7.
// Construct vector s
std::vector<Fr> s_vec(poly_length);
std::vector<Fr> s_vec(poly_length, Fr::one());

// TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as it's
// O(n log n). This can be optimized to be linear by computing a tree of products. It's very readable, so we're
// leaving it unoptimized for now.
parallel_for_heuristic(
poly_length,
[&](size_t i) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
s_vec[i] *= round_challenges_inv[log_poly_degree - 1 - j];
}
}
s_vec[i] = s_vec_scalar;
}, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree);

auto* srs_elements = vk->get_monomial_points();
@@ -405,10 +387,8 @@ template <typename Curve_> class IPA {
// G_vec_local should use only the original SRS thus we extract only the even indices.
parallel_for_heuristic(
poly_length,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
[&](size_t i) {
G_vec_local[i] = srs_elements[i * 2];
}, thread_heuristics::FF_COPY_COST * 2);

// Step 8.
2 changes: 2 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
@@ -25,4 +25,6 @@
#define BB_UNLIKELY(x) x
#endif

// Opinionated feature: functionally equivalent to [[maybe_unused]] but clearly
// marks things DEFINITELY unused. Aims to be more readable, at the tradeoff of being a custom thingy.
#define BB_UNUSED [[maybe_unused]]
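
A hypothetical usage sketch (the BB_UNUSED define is repeated only to keep the snippet self-contained); the signature mirrors the chunked parallel-for callbacks elsewhere in this diff, where the chunk index is deliberately ignored:

```cpp
#include <cstddef>

#define BB_UNUSED [[maybe_unused]]

// chunk_index is required by the callback signature but deliberately unused here;
// BB_UNUSED documents that and silences unused-parameter warnings.
void copy_even_indices(double* dst, const double* src, size_t start, size_t end, BB_UNUSED size_t chunk_index)
{
    for (size_t i = start; i < end; i++) {
        dst[i] = src[i * 2];
    }
}
```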
29 changes: 28 additions & 1 deletion barretenberg/cpp/src/barretenberg/common/container.hpp
@@ -4,6 +4,8 @@
#include <string>
#include <vector>

namespace bb {

template <typename C> C slice(C const& container, size_t start)
{
auto b = container.begin();
@@ -61,4 +63,29 @@ template <typename T> int64_t index_of(std::vector<T> const& vec, T const& item)
auto const& itr = std::find(begin, end, item);

return itr == end ? -1 : std::distance(begin, itr);
}
}

// A simple sum meant for small containers (i.e. doesn't use threading)
template <template <typename, typename...> typename Cont, typename Inner, typename... Args>
Inner sum(Cont<Inner, Args...> const& in)
{
Inner result{};
for (auto& e : in) {
result += e;
}
return result;
}

// A simple sum meant for small containers (i.e. doesn't use threading)
template <template <typename, typename...> typename Cont, typename Left, typename Right, typename... Args>
std::pair<Left, Right> sum_pairs(Cont<std::pair<Left, Right>, Args...> const& in)
{
std::pair<Left, Right> result{ {}, {} };
for (auto& e : in) {
result.first += e.first;
result.second += e.second;
}
return result;
}

} // namespace bb
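
A small usage sketch for the two helpers, with plain integers standing in for field elements; it assumes barretenberg's include paths are set up:

```cpp
#include "barretenberg/common/container.hpp"

#include <cassert>
#include <utility>
#include <vector>

int main()
{
    // e.g. one partial result per thread/chunk, combined on the main thread
    std::vector<int> partials{ 1, 2, 3 };
    assert(bb::sum(partials) == 6);

    std::vector<std::pair<int, int>> pair_partials{ { 1, 10 }, { 2, 20 } };
    auto [left, right] = bb::sum_pairs(pair_partials);
    assert(left == 3 && right == 30);
    return 0;
}
```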
28 changes: 28 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -60,6 +60,32 @@ void parallel_for_heuristic(size_t num_points, const Func& func, size_t heuristi
heuristic_cost);
}

/**
* @brief parallel_for_heuristic variant that takes an accumulator initializer
* that is allocated in a vector, one accumulator per thread/chunk.
* This allows for thread-safe accumulation, see sum() or sum_pairs() in container.hpp
* for an easy way to combine the thread/chunk contributions into a final result.
*/
template <typename Func, typename Accum>
requires std::invocable<Func, std::size_t, Accum&>
std::vector<Accum> parallel_for_heuristic(size_t num_points,
const Accum& initial_accum,
const Func& func,
size_t heuristic_cost)
{
// thread-safe accumulators
std::vector<Accum> accumulators(get_num_cpus(), initial_accum);
parallel_for_heuristic(
num_points,
[&](size_t start_idx, size_t end_idx, size_t chunk_index) {
for (size_t i = start_idx; i < end_idx; i++) {
func(i, accumulators[chunk_index]);
}
},
heuristic_cost);
return accumulators;
}

const size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;

/**
Expand Down Expand Up @@ -100,6 +126,8 @@ constexpr size_t GE_DOUBLING_COST = 194;
constexpr size_t SM_COST = 50000;
// Field element (16 byte) sequential copy number
constexpr size_t FF_COPY_COST = 3;
// Fine default if something looks 'chunky enough that I don't want to calculate'
constexpr size_t ALWAYS_MULTITHREAD = 100000;
} // namespace thread_heuristics

} // namespace bb
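
A sketch of what a call site for the new accumulator overload can look like, loosely following the IPA usage above; the function name and the cost estimate are illustrative, not part of the library:

```cpp
#include "barretenberg/common/container.hpp"
#include "barretenberg/common/thread.hpp"

#include <cstddef>
#include <vector>

// Sum of squares, accumulated with one slot per thread/chunk and combined at the end.
size_t sum_of_squares(const std::vector<size_t>& values)
{
    std::vector<size_t> partials = bb::parallel_for_heuristic(
        values.size(),
        /*initial accumulator value*/ size_t{ 0 },
        [&](size_t i, size_t& accum) { accum += values[i] * values[i]; },
        bb::thread_heuristics::FF_MULTIPLICATION_COST + bb::thread_heuristics::FF_ADDITION_COST);
    return bb::sum(partials);
}
```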
@@ -1,4 +1,7 @@
#pragma once
#include "barretenberg/common/container.hpp"
#include "barretenberg/common/op_count.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/flavor/flavor.hpp"
#include "barretenberg/ultra_honk/oink_prover.hpp"
#include "protogalaxy_prover.hpp"
@@ -10,16 +13,13 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver
const RelationSeparator& alpha,
const RelationParameters<FF>& relation_parameters)
{
BB_OP_COUNT_TIME_NAME("ProtoGalaxyProver_::compute_full_honk_evaluations");
auto instance_size = instance_polynomials.get_polynomial_size();
std::vector<FF> full_honk_evaluations(instance_size);
std::vector<FF> linearly_dependent_contributions(instance_size);
#ifndef NO_MULTITHREADING
std::mutex evaluation_mutex;
#endif
auto linearly_dependent_contribution_accumulator = FF(0);
parallel_for_range(instance_size, [&](size_t start_row, size_t end_row) {
auto thread_accumulator = FF(0);
for (size_t row = start_row; row < end_row; row++) {
std::vector<FF> linearly_dependent_contribution_accumulators = parallel_for_heuristic(
instance_size,
/*accumulator default*/ FF(0),
[&](size_t row, FF& linearly_dependent_contribution_accumulator) {
auto row_evaluations = instance_polynomials.get_row(row);
RelationEvaluations relation_evaluations;
Utils::zero_elements(relation_evaluations);
@@ -29,19 +29,13 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver

auto output = FF(0);
auto running_challenge = FF(1);
auto linearly_dependent_contribution = FF(0);
Utils::scale_and_batch_elements(
relation_evaluations, alpha, running_challenge, output, linearly_dependent_contribution);
thread_accumulator += linearly_dependent_contribution;
relation_evaluations, alpha, running_challenge, output, linearly_dependent_contribution_accumulator);

full_honk_evaluations[row] = output;
}
#ifndef NO_MULTITHREADING
std::unique_lock<std::mutex> evaluation_lock(evaluation_mutex);
#endif
linearly_dependent_contribution_accumulator += thread_accumulator;
});
full_honk_evaluations[0] += linearly_dependent_contribution_accumulator;
},
thread_heuristics::ALWAYS_MULTITHREAD);
full_honk_evaluations[0] += sum(linearly_dependent_contribution_accumulators);
return full_honk_evaluations;
}

@@ -59,20 +53,18 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver

auto degree = level + 1;
auto prev_level_width = prev_level_coeffs.size();
std::vector<std::vector<FF>> level_coeffs(prev_level_width >> 1, std::vector<FF>(degree + 1, 0));
parallel_for_range(
prev_level_width >> 1,
[&](size_t start, size_t end) {
for (size_t node = start << 1; node < end << 1; node += 2) {
auto parent = node >> 1;
std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
for (size_t d = 0; d < degree; d++) {
level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
}
std::vector<std::vector<FF>> level_coeffs(prev_level_width / 2, std::vector<FF>(degree + 1, 0));
parallel_for_heuristic(
prev_level_width / 2,
[&](size_t parent) {
size_t node = parent * 2;
std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
for (size_t d = 0; d < degree; d++) {
level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
}
},
/*no_multhreading_if_less_or_equal=*/8);
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3);
return construct_coefficients_tree(betas, deltas, level_coeffs, level + 1);
}

@@ -84,14 +76,15 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver
const std::vector<FF>& full_honk_evaluations)
{
auto width = full_honk_evaluations.size();
std::vector<std::vector<FF>> first_level_coeffs(width >> 1, std::vector<FF>(2, 0));
parallel_for_range(width >> 1, [&](size_t start, size_t end) {
for (size_t node = start << 1; node < end << 1; node += 2) {
auto parent = node >> 1;
std::vector<std::vector<FF>> first_level_coeffs(width / 2, std::vector<FF>(2, 0));
parallel_for_heuristic(
width / 2,
[&](size_t parent) {
size_t node = parent * 2;
first_level_coeffs[parent][0] = full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0];
first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0];
}
});
},
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3);
return construct_coefficients_tree(betas, deltas, first_level_coeffs);
}
