Skip to content

Commit

Permalink
chore: Remove mutex dependency (AztecProtocol#4160)
Browse files Browse the repository at this point in the history
Removes the mutex in the IPA opening. Adds a function that splits a loop
across parallel threads and gives each of them an index for working with
memory structures
  • Loading branch information
Rumata888 authored Jan 19, 2024
1 parent 9de0b96 commit 701688e
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 29 deletions.
24 changes: 16 additions & 8 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ template <typename Curve> class IPA {
transcript->send_to_verifier("IPA:poly_degree", static_cast<uint64_t>(poly_degree));
const Fr generator_challenge = transcript->get_challenge("IPA:generator_challenge");
auto aux_generator = Commitment::one() * generator_challenge;

// Checks poly_degree is greater than zero and a power of two
// In the future, we might want to consider if non-powers of two are needed
ASSERT((poly_degree > 0) && (!(poly_degree & (poly_degree - 1))) &&
Expand Down Expand Up @@ -90,30 +89,39 @@ template <typename Curve> class IPA {
std::vector<GroupElement> R_elements(log_poly_degree);
std::size_t round_size = poly_degree;

// Allocate vectors for parallel storage of partial products
const size_t num_cpus = get_num_cpus();
std::vector<Fr> partial_inner_prod_L(num_cpus);
std::vector<Fr> partial_inner_prod_R(num_cpus);
// Perform IPA rounds
for (size_t i = 0; i < log_poly_degree; i++) {
round_size >>= 1;
// Set partial products to zero
memset(&partial_inner_prod_L[0], 0, sizeof(Fr) * num_cpus);
memset(&partial_inner_prod_R[0], 0, sizeof(Fr) * num_cpus);
// Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
std::mutex addition_lock;
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
// Run scalar product in parallel
run_loop_in_parallel_if_effective(
run_loop_in_parallel_if_effective_with_index(
round_size,
[&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
[&a_vec, &b_vec, round_size, &partial_inner_prod_L, &partial_inner_prod_R](
size_t start, size_t end, size_t workload_index) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
addition_lock.lock();
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
addition_lock.unlock();
partial_inner_prod_L[workload_index] = current_inner_prod_L;
partial_inner_prod_R[workload_index] = current_inner_prod_R;
},
/*finite_field_additions_per_iteration=*/2,
/*finite_field_multiplications_per_iteration=*/2);
for (size_t j = 0; j < num_cpus; j++) {
inner_prod_L += partial_inner_prod_L[j];
inner_prod_R += partial_inner_prod_R[j];
}

// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
L_elements[i] = bb::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
Expand Down
44 changes: 32 additions & 12 deletions barretenberg/cpp/src/barretenberg/common/thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ void run_loop_in_parallel(size_t num_points,
* @param num_points Total number of elements
* @param func A function or lambda expression with a for loop inside, for example:
* [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
* Or for the version with index:
* [](size_t start, size_t end,size_t workload_index){for (size_t i=start; i<end; i++){(void)i;}}
* @param finite_field_additions_per_iteration The number of additions/subtractions/negations
* @param finite_field_multiplications_per_iteration The number of finite field multiplications and squarings
* @param finite_field_inversions_per_iteration
Expand All @@ -144,15 +146,18 @@ void run_loop_in_parallel(size_t num_points,
* @param scalar_multiplications_per_iteration
* @param sequential_copy_ops_per_iteration Field element (16 byte) sequential copy number
*/
void run_loop_in_parallel_if_effective(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t finite_field_additions_per_iteration,
size_t finite_field_multiplications_per_iteration,
size_t finite_field_inversions_per_iteration,
size_t group_element_additions_per_iteration,
size_t group_element_doublings_per_iteration,
size_t scalar_multiplications_per_iteration,
size_t sequential_copy_ops_per_iteration)
// Shared implementation behind run_loop_in_parallel_if_effective and
// run_loop_in_parallel_if_effective_with_index.
// FunctionType is constrained to exactly two callback shapes:
//   std::function<void(size_t, size_t)>          -> func(start, end)
//   std::function<void(size_t, size_t, size_t)>  -> func(start, end, workload_index)
// The callback is invoked either once over the whole range [0, num_points) or once per chunk.
template <typename FunctionType>
requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
void run_loop_in_parallel_if_effective_internal(size_t num_points,
const FunctionType& func,
size_t finite_field_additions_per_iteration,
size_t finite_field_multiplications_per_iteration,
size_t finite_field_inversions_per_iteration,
size_t group_element_additions_per_iteration,
size_t group_element_doublings_per_iteration,
size_t scalar_multiplications_per_iteration,
size_t sequential_copy_ops_per_iteration)
{
// Rough cost of operations (the operation costs are derived in basics_bench and the units are nanoseconds):
constexpr size_t FF_ADDITION_COST = 4;
Expand Down Expand Up @@ -185,7 +190,12 @@ void run_loop_in_parallel_if_effective(size_t num_points,

// If starting parallel for is longer than computing, just compute
if (offset_cost < PARALLEL_FOR_COST) {
func(0, num_points);
// Sequential path: the callback shape is selected at compile time. The three-argument
// form receives workload index 0 because the whole range is handled as a single workload.
if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {

func(0, num_points);
} else {
func(0, num_points, 0);
}
return;
}
// Parallelize over chunks
Expand All @@ -201,6 +211,16 @@ void run_loop_in_parallel_if_effective(size_t num_points,
}
size_t start = chunk_index * chunk_size;
size_t end = chunk_index * chunk_size + current_chunk_size;
func(start, end);

// Per-chunk path: same compile-time dispatch; the three-argument form is handed
// chunk_index as its workload index.
// NOTE(review): callers that index per-thread storage by this value rely on the number of
// chunks not exceeding the storage size; the chunk-count computation is elided here - confirm.
if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {

func(start, end);
} else {
func(start, end, chunk_index);
}
});
};
};
// Explicit instantiations for the two supported callback signatures. The template is defined in
// this translation unit, so these make both specializations available to code that only sees the
// declaration.
// Two-argument callback: func(start, end).
template void run_loop_in_parallel_if_effective_internal(
size_t, const std::function<void(size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
// Three-argument callback: func(start, end, workload_index).
template void run_loop_in_parallel_if_effective_internal(
size_t, const std::function<void(size_t, size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
73 changes: 64 additions & 9 deletions barretenberg/cpp/src/barretenberg/common/thread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,67 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal = 0);
void run_loop_in_parallel_if_effective(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t finite_field_additions_per_iteration = 0,
size_t finite_field_multiplications_per_iteration = 0,
size_t finite_field_inversions_per_iteration = 0,
size_t group_element_additions_per_iteration = 0,
size_t group_element_doublings_per_iteration = 0,
size_t scalar_multiplications_per_iteration = 0,
size_t sequential_copy_ops_per_iteration = 0);

// Declaration of the shared implementation used by the run_loop_in_parallel_if_effective* wrappers.
// FunctionType must be exactly std::function<void(size_t, size_t)> or
// std::function<void(size_t, size_t, size_t)>.
// Parameters are positional and unnamed here. In order: num_points, func, then the per-iteration
// counts for finite field additions, finite field multiplications, finite field inversions,
// group element additions, group element doublings, scalar multiplications and sequential copy ops.
template <typename FunctionType>
requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
void run_loop_in_parallel_if_effective_internal(
size_t, const FunctionType&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
/**
 * @brief Runs a loop in parallel if parallelization is useful (costs less than the algorithm)
 *
 * @details Thin wrapper: forwards every argument unchanged to run_loop_in_parallel_if_effective_internal.
 * Please see run_loop_in_parallel_if_effective_internal for a detailed description.
 *
 * @param num_points Total number of elements
 * @param func Loop body taking (start, end)
 * @param finite_field_additions_per_iteration The number of additions/subtractions/negations
 * @param finite_field_multiplications_per_iteration The number of finite field multiplications and squarings
 * @param finite_field_inversions_per_iteration Forwarded as-is (defaults to 0)
 * @param group_element_additions_per_iteration Forwarded as-is (defaults to 0)
 * @param group_element_doublings_per_iteration Forwarded as-is (defaults to 0)
 * @param scalar_multiplications_per_iteration Forwarded as-is (defaults to 0)
 * @param sequential_copy_ops_per_iteration Field element (16 byte) sequential copy number
 */
inline void run_loop_in_parallel_if_effective(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t finite_field_additions_per_iteration = 0,
size_t finite_field_multiplications_per_iteration = 0,
size_t finite_field_inversions_per_iteration = 0,
size_t group_element_additions_per_iteration = 0,
size_t group_element_doublings_per_iteration = 0,
size_t scalar_multiplications_per_iteration = 0,
size_t sequential_copy_ops_per_iteration = 0

)
{
// FunctionType is deduced as std::function<void(size_t, size_t)>, selecting the two-argument callback form.
run_loop_in_parallel_if_effective_internal(num_points,
func,
finite_field_additions_per_iteration,
finite_field_multiplications_per_iteration,
finite_field_inversions_per_iteration,
group_element_additions_per_iteration,
group_element_doublings_per_iteration,
scalar_multiplications_per_iteration,
sequential_copy_ops_per_iteration);
}

/**
 * @brief Runs a loop in parallel if parallelization is useful (costs less than the algorithm). The loop function is
 * given the index of the workload.
 *
 * @details Thin wrapper: forwards every argument unchanged to run_loop_in_parallel_if_effective_internal.
 * Please see run_loop_in_parallel_if_effective_internal for a detailed description.
 *
 * @param num_points Total number of elements
 * @param func Loop body taking (start, end, workload_index)
 * @param finite_field_additions_per_iteration The number of additions/subtractions/negations
 * @param finite_field_multiplications_per_iteration The number of finite field multiplications and squarings
 * @param finite_field_inversions_per_iteration Forwarded as-is (defaults to 0)
 * @param group_element_additions_per_iteration Forwarded as-is (defaults to 0)
 * @param group_element_doublings_per_iteration Forwarded as-is (defaults to 0)
 * @param scalar_multiplications_per_iteration Forwarded as-is (defaults to 0)
 * @param sequential_copy_ops_per_iteration Field element (16 byte) sequential copy number
 */
inline void run_loop_in_parallel_if_effective_with_index(size_t num_points,
const std::function<void(size_t, size_t, size_t)>& func,
size_t finite_field_additions_per_iteration = 0,
size_t finite_field_multiplications_per_iteration = 0,
size_t finite_field_inversions_per_iteration = 0,
size_t group_element_additions_per_iteration = 0,
size_t group_element_doublings_per_iteration = 0,
size_t scalar_multiplications_per_iteration = 0,
size_t sequential_copy_ops_per_iteration = 0

)
{
// FunctionType is deduced as std::function<void(size_t, size_t, size_t)>, selecting the indexed callback form.
run_loop_in_parallel_if_effective_internal(num_points,
func,
finite_field_additions_per_iteration,
finite_field_multiplications_per_iteration,
finite_field_inversions_per_iteration,
group_element_additions_per_iteration,
group_element_doublings_per_iteration,
scalar_multiplications_per_iteration,
sequential_copy_ops_per_iteration);
}

0 comments on commit 701688e

Please sign in to comment.