From 5ed230915fdc0bc23a35f58c8fd015e58d5b0bb7 Mon Sep 17 00:00:00 2001 From: ludamad Date: Mon, 19 Aug 2024 23:55:06 +0000 Subject: [PATCH 01/10] ipa polynomial opt --- .../cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index bdb31f2eac5..b96cc9b0f65 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -134,7 +134,7 @@ template class IPA { const std::shared_ptr& transcript) { - Polynomial polynomial = opening_claim.polynomial; + const Polynomial& polynomial = opening_claim.polynomial; // clang-format on auto poly_length = static_cast(polynomial.size()); From 39c4691fbb85adebe7912a1b463a7032be4ca1b1 Mon Sep 17 00:00:00 2001 From: ludamad Date: Tue, 20 Aug 2024 00:36:20 +0000 Subject: [PATCH 02/10] Introduce convenient SlabVector shorthand --- .../commitment_schemes/ipa/ipa.hpp | 22 +++++-------------- .../barretenberg/common/slab_allocator.hpp | 6 +++++ .../dsl/acir_format/acir_format.hpp | 10 +++------ .../honk_recursion_constraint.test.cpp | 2 +- .../acir_format/recursion_constraint.test.cpp | 2 +- .../dsl/acir_proofs/acir_composer.hpp | 2 +- .../execution_trace/execution_trace.hpp | 2 +- .../arithmetization/arithmetization.hpp | 5 +++-- .../translator_circuit_builder.hpp | 2 +- 9 files changed, 23 insertions(+), 30 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index b96cc9b0f65..7b16c220944 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -90,8 +90,6 @@ template class IPA { #ifdef IPA_FUZZ_TEST friend class ProxyCaller; #endif - // clang-format off - /** * @brief Compute an inner product argument proof for opening a single polynomial at a single evaluation point. * @@ -128,16 +126,14 @@ template class IPA { * *7. Send the final \f$\vec{a}_{0} = (a_0)\f$ to the verifier */ - template - static void compute_opening_proof_internal(const std::shared_ptr& ck, - const ProverOpeningClaim& opening_claim, - const std::shared_ptr& transcript) - { - + template + static void compute_opening_proof_internal(const std::shared_ptr& ck, + const ProverOpeningClaim& opening_claim, + const std::shared_ptr& transcript) + { const Polynomial& polynomial = opening_claim.polynomial; - // clang-format on - auto poly_length = static_cast(polynomial.size()); + size_t poly_length = polynomial.size(); // Step 1. // Send polynomial degree + 1 = d to the verifier @@ -208,11 +204,9 @@ template class IPA { GroupElement R_i; std::size_t round_size = poly_length; -#ifndef NO_MULTITHREADING // The inner products we'll be computing in parallel need a mutex to be thread-safe during the last // accumulation std::mutex inner_product_accumulation_mutex; -#endif // Step 6. // Perform IPA reduction rounds for (size_t i = 0; i < log_poly_degree; i++) { @@ -228,10 +222,8 @@ template class IPA { round_size, &inner_prod_L, &inner_prod_R -#ifndef NO_MULTITHREADING , &inner_product_accumulation_mutex -#endif ](size_t start, size_t end) { Fr current_inner_prod_L = Fr::zero(); Fr current_inner_prod_R = Fr::zero(); @@ -241,9 +233,7 @@ template class IPA { } // Update the accumulated results thread-safely { -#ifndef NO_MULTITHREADING std::unique_lock lock(inner_product_accumulation_mutex); -#endif inner_prod_L += current_inner_prod_L; inner_prod_R += current_inner_prod_R; } diff --git a/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp b/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp index 1eb03b1ae3a..fbdd310756f 100644 --- a/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp +++ b/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #ifndef NO_MULTITHREADING #include #endif @@ -75,4 +76,9 @@ template class ContainerSlabAllocator { } }; +/** + * @brief A vector that uses the slab allocator. + */ +template using SlabVector = std::vector>; + } // namespace bb \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp index 81773ed655f..06aec65a0ff 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp @@ -102,12 +102,8 @@ struct AcirFormat { // A standard plonk arithmetic constraint, as defined in the poly_triple struct, consists of selector values // for q_M,q_L,q_R,q_O,q_C and indices of three variables taking the role of left, right and output wire // This could be a large vector so use slab allocator, we don't expect the blackbox implementations to be so large. - std::vector, - bb::ContainerSlabAllocator>> - poly_triple_constraints; - std::vector, - bb::ContainerSlabAllocator>> - quad_constraints; + bb::SlabVector> poly_triple_constraints; + bb::SlabVector> quad_constraints; std::vector block_constraints; // Number of gates added to the circuit per original opcode. @@ -148,7 +144,7 @@ struct AcirFormat { friend bool operator==(AcirFormat const& lhs, AcirFormat const& rhs) = default; }; -using WitnessVector = std::vector>; +using WitnessVector = bb::SlabVector; using WitnessVectorStack = std::vector>; struct AcirProgram { diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/honk_recursion_constraint.test.cpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/honk_recursion_constraint.test.cpp index 86ac113e76d..dc3e1222f7e 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/honk_recursion_constraint.test.cpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/honk_recursion_constraint.test.cpp @@ -141,7 +141,7 @@ class AcirHonkRecursionConstraint : public ::testing::Test { std::vector honk_recursion_constraints; size_t witness_offset = 0; - std::vector> witness; + SlabVector witness; for (auto& inner_circuit : inner_circuits) { diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/recursion_constraint.test.cpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/recursion_constraint.test.cpp index 95b650f13d3..2e6384e13c2 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/recursion_constraint.test.cpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/recursion_constraint.test.cpp @@ -139,7 +139,7 @@ Builder create_outer_circuit(std::vector& inner_circuits) std::vector recursion_constraints; size_t witness_offset = 0; - std::vector> witness; + SlabVector witness; for (auto& inner_circuit : inner_circuits) { auto inner_composer = Composer(); diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_proofs/acir_composer.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_proofs/acir_composer.hpp index f53011ec374..f222381c429 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_proofs/acir_composer.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_proofs/acir_composer.hpp @@ -11,7 +11,7 @@ namespace acir_proofs { */ class AcirComposer { - using WitnessVector = std::vector>; + using WitnessVector = bb::SlabVector; public: AcirComposer(size_t size_hint = 0, bool verbose = true); diff --git a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp index 39ddf504706..7c375848f2a 100644 --- a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp +++ b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp @@ -10,7 +10,7 @@ template class ExecutionTrace_ { using Polynomial = typename Flavor::Polynomial; using FF = typename Flavor::FF; using TrackBlocks = typename Builder::Arithmetization::TraceBlocks; - using Wires = std::array>, Builder::NUM_WIRES>; + using Wires = std::array, Builder::NUM_WIRES>; using ProvingKey = typename Flavor::ProvingKey; public: diff --git a/barretenberg/cpp/src/barretenberg/plonk_honk_shared/arithmetization/arithmetization.hpp b/barretenberg/cpp/src/barretenberg/plonk_honk_shared/arithmetization/arithmetization.hpp index 14789bfa263..90351184809 100644 --- a/barretenberg/cpp/src/barretenberg/plonk_honk_shared/arithmetization/arithmetization.hpp +++ b/barretenberg/cpp/src/barretenberg/plonk_honk_shared/arithmetization/arithmetization.hpp @@ -1,5 +1,6 @@ #pragma once #include "barretenberg/common/ref_array.hpp" +#include "barretenberg/common/slab_allocator.hpp" #include #ifdef CHECK_CIRCUIT_STACKTRACES @@ -39,8 +40,8 @@ enum class TraceStructure { NONE, SMALL_TEST, CLIENT_IVC_BENCH, AZTEC_IVC_BENCH, */ template class ExecutionTraceBlock { public: - using SelectorType = std::vector>; - using WireType = std::vector>; + using SelectorType = SlabVector; + using WireType = SlabVector; using Selectors = std::array; using Wires = std::array; diff --git a/barretenberg/cpp/src/barretenberg/translator_vm/translator_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/translator_vm/translator_circuit_builder.hpp index d72cb072bf3..9b9f6b9de30 100644 --- a/barretenberg/cpp/src/barretenberg/translator_vm/translator_circuit_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/translator_vm/translator_circuit_builder.hpp @@ -322,7 +322,7 @@ class TranslatorCircuitBuilder : public CircuitBuilderBase { // The input we evaluate polynomials on Fq evaluation_input_x; - std::array>, NUM_WIRES> wires; + std::array, NUM_WIRES> wires; /** * @brief Construct a new Translator Circuit Builder object From 910cba5d2e317ab3b5ba04b2fea2a9245ccccba0 Mon Sep 17 00:00:00 2001 From: ludamad Date: Tue, 20 Aug 2024 02:14:31 +0000 Subject: [PATCH 03/10] grand parallel for refactor --- .../commitment_schemes/ipa/ipa.hpp | 95 +++---- .../barretenberg/common/compiler_hints.hpp | 4 +- .../cpp/src/barretenberg/common/thread.cpp | 74 +----- .../cpp/src/barretenberg/common/thread.hpp | 105 ++++---- .../barretenberg/ecc/groups/element_impl.hpp | 232 +++++------------- .../eccvm/eccvm_circuit_builder.hpp | 2 +- .../src/barretenberg/eccvm/eccvm_flavor.hpp | 6 +- .../src/barretenberg/eccvm/msm_builder.hpp | 4 +- .../eccvm/precomputed_tables_builder.hpp | 2 +- .../grumpkin_srs_gen/grumpkin_srs_gen.cpp | 2 +- .../protogalaxy/protogalaxy_prover.hpp | 6 +- .../protogalaxy/protogalaxy_prover_impl.hpp | 4 +- .../translator_vm/translator_prover.cpp | 2 +- 13 files changed, 165 insertions(+), 373 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 7b16c220944..191296a0ed8 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -2,6 +2,7 @@ #include "barretenberg/commitment_schemes/claim.hpp" #include "barretenberg/commitment_schemes/verification_key.hpp" #include "barretenberg/common/assert.hpp" +#include "barretenberg/common/thread.hpp" #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/transcript/transcript.hpp" #include @@ -165,36 +166,27 @@ template class IPA { // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. - run_loop_in_parallel_if_effective( + parallel_for_heuristic( poly_length, - [&G_vec_local, srs_elements](size_t start, size_t end) { + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { for (size_t i = start * 2; i < end * 2; i += 2) { G_vec_local[i >> 1] = srs_elements[i]; } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + }, thread_heuristics::FF_COPY_COST); // Step 5. // Compute vector b (vector of the powers of the challenge) OpeningPair opening_pair = opening_claim.opening_pair; std::vector b_vec(poly_length); - run_loop_in_parallel_if_effective( + parallel_for_heuristic( poly_length, - [&b_vec, &opening_pair](size_t start, size_t end) { + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { Fr b_power = opening_pair.challenge.pow(start); for (size_t i = start; i < end; i++) { b_vec[i] = b_power; b_power *= opening_pair.challenge; } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/1); + }, thread_heuristics::FF_COPY_COST + thread_heuristics::FF_MULTIPLICATION_COST); // Iterate for log(poly_degree) rounds to compute the round commitments. auto log_poly_degree = static_cast(numeric::get_msb(poly_length)); @@ -215,16 +207,9 @@ template class IPA { Fr inner_prod_L = Fr::zero(); Fr inner_prod_R = Fr::zero(); // Run scalar products in parallel - run_loop_in_parallel_if_effective( + parallel_for_heuristic( round_size, - [&a_vec, - &b_vec, - round_size, - &inner_prod_L, - &inner_prod_R - , - &inner_product_accumulation_mutex - ](size_t start, size_t end) { + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { Fr current_inner_prod_L = Fr::zero(); Fr current_inner_prod_R = Fr::zero(); for (size_t j = start; j < end; j++) { @@ -237,9 +222,7 @@ template class IPA { inner_prod_L += current_inner_prod_L; inner_prod_R += current_inner_prod_R; } - }, - /*finite_field_additions_per_iteration=*/2, - /*finite_field_multiplications_per_iteration=*/2); + }, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2); // Step 6.a (using letters, because doxygen automaticall converts the sublist counters to letters :( ) // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator @@ -271,11 +254,11 @@ template class IPA { // Step 6.e // G_vec_new = G_vec_lo + G_vec_hi * round_challenge_inv auto G_hi_by_inverse_challenge = GroupElement::batch_mul_with_endomorphism( - std::span{ G_vec_local.begin() + static_cast(round_size), - G_vec_local.begin() + static_cast(round_size * 2) }, + std::span{ G_vec_local.begin() + static_cast(round_size), + G_vec_local.begin() + static_cast(round_size * 2) }, round_challenge_inv); GroupElement::batch_affine_add( - std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast(round_size) }, + std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast(round_size) }, G_hi_by_inverse_challenge, G_vec_local); @@ -283,17 +266,12 @@ template class IPA { // Update the vectors a_vec, b_vec. // a_vec_new = a_vec_lo + a_vec_hi * round_challenge // b_vec_new = b_vec_lo + b_vec_hi * round_challenge_inv - run_loop_in_parallel_if_effective( + parallel_for_heuristic( round_size, - [&a_vec, &b_vec, round_challenge, round_challenge_inv, round_size](size_t start, size_t end) { - for (size_t j = start; j < end; j++) { - a_vec[j] += round_challenge * a_vec[round_size + j]; - b_vec[j] += round_challenge_inv * b_vec[round_size + j]; - } - }, - /*finite_field_additions_per_iteration=*/4, - /*finite_field_multiplications_per_iteration=*/8, - /*finite_field_inversions_per_iteration=*/1); + [&](size_t j) { + a_vec[j] += round_challenge * a_vec[round_size + j]; + b_vec[j] += round_challenge_inv * b_vec[round_size + j]; + }, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2); } // Step 7 @@ -399,23 +377,19 @@ template class IPA { // TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as its // O(nlogn). This can be optimized to be linear by computing a tree of products. Its very readable, so we're // leaving it unoptimized for now. - run_loop_in_parallel_if_effective( + parallel_for_heuristic( poly_length, - [&s_vec, &round_challenges_inv, log_poly_degree](size_t start, size_t end) { - for (size_t i = start; i < end; i++) { - Fr s_vec_scalar = Fr::one(); - for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) { - auto bit = (i >> j) & 1; - bool b = static_cast(bit); - if (b) { - s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j]; - } + [&](size_t i) { + Fr s_vec_scalar = Fr::one(); + for (size_t j = (log_poly_degree - 1); j != static_cast(-1); j--) { + auto bit = (i >> j) & 1; + bool b = static_cast(bit); + if (b) { + s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j]; } - s_vec[i] = s_vec_scalar; } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/log_poly_degree); + s_vec[i] = s_vec_scalar; + }, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree); auto* srs_elements = vk->get_monomial_points(); @@ -425,20 +399,13 @@ template class IPA { // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. - run_loop_in_parallel_if_effective( + parallel_for_heuristic( poly_length, - [&G_vec_local, srs_elements](size_t start, size_t end) { + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { for (size_t i = start * 2; i < end * 2; i += 2) { G_vec_local[i >> 1] = srs_elements[i]; } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + }, thread_heuristics::FF_COPY_COST * 2); // Step 8. // Compute G₀ diff --git a/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp index 9492475cc1c..2ff5feb85b1 100644 --- a/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp +++ b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp @@ -23,4 +23,6 @@ #else #define BB_LIKELY(x) x #define BB_UNLIKELY(x) x -#endif \ No newline at end of file +#endif + +#define BB_UNUSED [[maybe_unused]] \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp index 8aec8be2cfd..97a278bb360 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.cpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp @@ -99,9 +99,9 @@ void parallel_for(size_t num_iterations, const std::function& func * @param no_multhreading_if_less_or_equal If num points is less or equal to this value, run without parallelization * */ -void run_loop_in_parallel(size_t num_points, - const std::function& func, - size_t no_multhreading_if_less_or_equal) +void parallel_for_threshold(size_t num_points, + const std::function& func, + size_t no_multhreading_if_less_or_equal) { if (num_points <= no_multhreading_if_less_or_equal) { func(0, num_points); @@ -129,45 +129,10 @@ void run_loop_in_parallel(size_t num_points, }); }; -/** - * @brief Split a loop into several loops running in parallel based on operations in 1 iteration - * - * @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function - * that should contain the work loop, but only if it's worth it - * @param num_points Total number of elements - * @param func A function or lambda expression with a for loop inside, for example: - * [](size_t start, size_t end){for (size_t i=start; i - requires(std::is_same_v> || - std::is_same_v>) -void run_loop_in_parallel_if_effective_internal(size_t num_points, - const FunctionType& func, - size_t finite_field_additions_per_iteration, - size_t finite_field_multiplications_per_iteration, - size_t finite_field_inversions_per_iteration, - size_t group_element_additions_per_iteration, - size_t group_element_doublings_per_iteration, - size_t scalar_multiplications_per_iteration, - size_t sequential_copy_ops_per_iteration) +void parallel_for_heuristic(size_t num_points, + const std::function& func, + size_t heuristic_cost) { - // Rough cost of operations (the operation costs are derives in basics_bench and the units are nanoseconds): - constexpr size_t FF_ADDITION_COST = 4; - constexpr size_t FF_MULTIPLICATION_COST = 21; - constexpr size_t FF_INVERSION_COST = 7000; - constexpr size_t GE_ADDITION_COST = 350; - constexpr size_t GE_DOUBLING_COST = 194; - constexpr size_t SM_COST = 50000; - constexpr size_t SEQ_COPY_COST = 3; // We take the maximum observed parallel_for cost (388 us) and round it up. // The goals of these checks is to evade significantly (10x) increasing processing time for small workloads. So we // can accept not triggering parallel_for if the workload would become faster by half a millisecond for medium @@ -180,23 +145,11 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points, const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1); // Compute the cost of all operations done by other threads - const size_t offset_cost = - (num_points - chunk_size) * - (finite_field_additions_per_iteration * FF_ADDITION_COST + - finite_field_multiplications_per_iteration * FF_MULTIPLICATION_COST + - finite_field_inversions_per_iteration * FF_INVERSION_COST + - group_element_additions_per_iteration * GE_ADDITION_COST + - group_element_doublings_per_iteration * GE_DOUBLING_COST + scalar_multiplications_per_iteration * SM_COST + - sequential_copy_ops_per_iteration * SEQ_COPY_COST); + const size_t offset_cost = (num_points - chunk_size) * heuristic_cost; // If starting parallel for is longer than computing, just compute if (offset_cost < PARALLEL_FOR_COST) { - if constexpr (std::is_same_v>) { - - func(0, num_points); - } else { - func(0, num_points, 0); - } + func(0, num_points, 0); return; } // Parallelize over chunks @@ -213,18 +166,9 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points, size_t start = chunk_index * chunk_size; size_t end = chunk_index * chunk_size + current_chunk_size; - if constexpr (std::is_same_v>) { - - func(start, end); - } else { - func(start, end, chunk_index); - } + func(start, end, chunk_index); }); }; -template void run_loop_in_parallel_if_effective_internal( - size_t, const std::function&, size_t, size_t, size_t, size_t, size_t, size_t, size_t); -template void run_loop_in_parallel_if_effective_internal( - size_t, const std::function&, size_t, size_t, size_t, size_t, size_t, size_t, size_t); /** * @brief calculates number of threads to create based on minimum iterations per thread diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp b/barretenberg/cpp/src/barretenberg/common/thread.hpp index 98ade3f4593..ab85f394674 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.hpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.hpp @@ -1,4 +1,5 @@ #pragma once +#include "barretenberg/common/compiler_hints.hpp" #include #include #include @@ -24,75 +25,39 @@ inline size_t get_num_cpus_pow2() * @param num_iterations Number of iterations * @param func Function to run in parallel * Observe that num_iterations is NOT the thread pool size. - * The size will be chosen based on the hardware concurrency (i.e., env or cpus).. + * The size will be chosen based on the hardware concurrency (i.e., env or cpus). */ void parallel_for(size_t num_iterations, const std::function& func); -void run_loop_in_parallel(size_t num_points, - const std::function& func, - size_t no_multhreading_if_less_or_equal = 0); +void parallel_for_threshold(size_t num_points, + const std::function& func, + size_t no_multhreading_if_less_or_equal = 0); -template - requires(std::is_same_v> || - std::is_same_v>) -void run_loop_in_parallel_if_effective_internal( - size_t, const FunctionType&, size_t, size_t, size_t, size_t, size_t, size_t, size_t); /** - * @brief Runs loop in parallel if parallelization if useful (costs less than the algorithm) - * - * @details Please see run_loop_in_parallel_if_effective_internal for detailed description - * - */ -inline void run_loop_in_parallel_if_effective(size_t num_points, - const std::function& func, - size_t finite_field_additions_per_iteration = 0, - size_t finite_field_multiplications_per_iteration = 0, - size_t finite_field_inversions_per_iteration = 0, - size_t group_element_additions_per_iteration = 0, - size_t group_element_doublings_per_iteration = 0, - size_t scalar_multiplications_per_iteration = 0, - size_t sequential_copy_ops_per_iteration = 0 - -) -{ - run_loop_in_parallel_if_effective_internal(num_points, - func, - finite_field_additions_per_iteration, - finite_field_multiplications_per_iteration, - finite_field_inversions_per_iteration, - group_element_additions_per_iteration, - group_element_doublings_per_iteration, - scalar_multiplications_per_iteration, - sequential_copy_ops_per_iteration); -} - -/** - * @brief Runs loop in parallel if parallelization if useful (costs less than the algorith). The loop function is given - * the index of the workload. - * - * @details Please see run_loop_in_parallel_if_effective_internal for detailed description + * @brief Split a loop into several loops running in parallel based on operations in 1 iteration * + * @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function + * that should contain the work loop, but only if it's worth it + * @param num_points Total number of elements + * @param func A function or lambda expression with a for loop inside, for example: + * [&](size_t start, size_t end, size_t thread_index){for (size_t i=start; i& func, - size_t finite_field_additions_per_iteration = 0, - size_t finite_field_multiplications_per_iteration = 0, - size_t finite_field_inversions_per_iteration = 0, - size_t group_element_additions_per_iteration = 0, - size_t group_element_doublings_per_iteration = 0, - size_t scalar_multiplications_per_iteration = 0, - size_t sequential_copy_ops_per_iteration = 0 +void parallel_for_heuristic(size_t num_points, + const std::function& func, + size_t heuristic_cost); -) +template + requires std::invocable +void parallel_for_heuristic(size_t num_points, const Func& func, size_t heuristic_cost) { - run_loop_in_parallel_if_effective_internal(num_points, - func, - finite_field_additions_per_iteration, - finite_field_multiplications_per_iteration, - finite_field_inversions_per_iteration, - group_element_additions_per_iteration, - group_element_doublings_per_iteration, - scalar_multiplications_per_iteration, - sequential_copy_ops_per_iteration); + parallel_for_heuristic( + num_points, + [&](size_t start_idx, size_t end_idx, BB_UNUSED size_t chunk_index) { + for (size_t i = start_idx; i < end_idx; i++) { + func(i); + } + }, + heuristic_cost); } const size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4; @@ -119,4 +84,22 @@ size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_th size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD); +namespace thread_heuristics { +// Rough cost of operations (the operation costs are derives in basics_bench and the units are nanoseconds) +// Field element (16 byte) addition cost +constexpr size_t FF_ADDITION_COST = 4; +// Field element (16 byte) multiplication cost +constexpr size_t FF_MULTIPLICATION_COST = 21; +// Field element (16 byte) inversion cost +constexpr size_t FF_INVERSION_COST = 7000; +// Group element projective addition number +constexpr size_t GE_ADDITION_COST = 350; +// Group element projective doubling number +constexpr size_t GE_DOUBLING_COST = 194; +// Group element scalar multiplication cost +constexpr size_t SM_COST = 50000; +// Field element (16 byte) sequential copy number +constexpr size_t FF_COPY_COST = 3; +} // namespace thread_heuristics + } // namespace bb diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp index 19bc945e476..6630406b89a 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp @@ -728,20 +728,8 @@ void element::batch_affine_add(const std::span scratch_space(num_points); - run_loop_in_parallel_if_effective( - num_points, - [&results, &first_group](size_t start, size_t end) { - for (size_t i = start; i < end; i++) { - results[i] = first_group[i]; - } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/2); + parallel_for_heuristic( + num_points, [&](size_t i) { results[i] = first_group[i]; }, thread_heuristics::FF_COPY_COST * 2); // TODO(#826): Same code as in batch mul // we can mutate rhs but NOT lhs! @@ -779,16 +767,14 @@ void element::batch_affine_add(const std::span> element::batch_mul_with_endomo */ const auto batch_affine_add_internal = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, affine_element* rhs) { - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); }, - /*finite_field_additions_per_iteration=*/6, - /*finite_field_multiplications_per_iteration=*/6); + thread_heuristics::FF_ADDITION_COST * 6 + thread_heuristics::FF_MULTIPLICATION_COST * 6); }; /** @@ -896,13 +881,12 @@ std::vector> element::batch_mul_with_endomo * */ const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) { - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) { + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start); }, - /*finite_field_additions_per_iteration=*/7, - /*finite_field_multiplications_per_iteration=*/6); + thread_heuristics::FF_ADDITION_COST * 7 + thread_heuristics::FF_MULTIPLICATION_COST * 6); }; // We compute the resulting point through WNAF by evaluating (the (\sum_i (16ⁱ⋅ @@ -912,22 +896,9 @@ std::vector> element::batch_mul_with_endomo // computing p⋅Point, we get a point at infinity, which is an edgecase, and we don't want to handle edgecases in the // hot loop since the slow the computation down. So it's better to just handle it here. if (scalar == -Fr::one()) { - std::vector results(num_points); - run_loop_in_parallel_if_effective( - num_points, - [&results, &points](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - results[i] = -points[i]; - } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + parallel_for_heuristic( + num_points, [&](size_t i) { results[i] = -points[i]; }, thread_heuristics::FF_COPY_COST); return results; } // Compute wnaf for scalar @@ -938,20 +909,8 @@ std::vector> element::batch_mul_with_endomo affine_element result{ Fq::zero(), Fq::zero() }; result.self_set_infinity(); std::vector results(num_points); - run_loop_in_parallel_if_effective( - num_points, - [&results, result](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - results[i] = result; - } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + parallel_for_heuristic( + num_points, [&](size_t i) { results[i] = result; }, thread_heuristics::FF_COPY_COST); return results; } @@ -963,41 +922,23 @@ std::vector> element::batch_mul_with_endomo } // Initialize first etnries in lookup table std::vector temp_point_vector(num_points); - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - // If the point is at infinity we fix-up the result later - // To avoid 'trying to invert zero in the field' we set the point to 'one' here - temp_point_vector[i] = points[i].is_point_at_infinity() ? affine_element::one() : points[i]; - lookup_table[0][i] = points[i].is_point_at_infinity() ? affine_element::one() : points[i]; - } + [&](size_t i) { + // If the point is at infinity we fix-up the result later + // To avoid 'trying to invert zero in the field' we set the point to 'one' here + temp_point_vector[i] = points[i].is_point_at_infinity() ? affine_element::one() : points[i]; + lookup_table[0][i] = points[i].is_point_at_infinity() ? affine_element::one() : points[i]; }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/2); + thread_heuristics::FF_COPY_COST * 2); // Construct lookup table batch_affine_double(&temp_point_vector[0]); for (size_t j = 1; j < LOOKUP_SIZE; ++j) { - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [j, &lookup_table](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - lookup_table[j][i] = lookup_table[j - 1][i]; - } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + [&](size_t i) { lookup_table[j][i] = lookup_table[j - 1][i]; }, + thread_heuristics::FF_COPY_COST); batch_affine_add_internal(&temp_point_vector[0], &lookup_table[j][0]); } @@ -1016,31 +957,22 @@ std::vector> element::batch_mul_with_endomo index = wnaf_entry & 0x0fffffffU; sign = static_cast((wnaf_entry >> 31) & 1); const bool is_odd = ((j & 1) == 1); - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start, - size_t end) { - for (size_t i = start; i < end; ++i) { - - auto to_add = lookup_table[static_cast(index)][i]; - to_add.y.self_conditional_negate(sign ^ is_odd); - if (is_odd) { - to_add.x *= beta; - } - if (j == 0) { - work_elements[i] = to_add; - } else { - temp_point_vector[i] = to_add; - } + [&](size_t i) { + auto to_add = lookup_table[static_cast(index)][i]; + to_add.y.self_conditional_negate(sign ^ is_odd); + if (is_odd) { + to_add.x *= beta; + } + if (j == 0) { + work_elements[i] = to_add; + } else { + temp_point_vector[i] = to_add; } }, - /*finite_field_additions_per_iteration=*/1, - /*finite_field_multiplications_per_iteration=*/is_odd ? 1 : 0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + (is_odd ? thread_heuristics::FF_MULTIPLICATION_COST : 0) + thread_heuristics::FF_COPY_COST + + thread_heuristics::FF_ADDITION_COST); } // First cycle of addition batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); @@ -1055,83 +987,47 @@ std::vector> element::batch_mul_with_endomo batch_affine_double(&work_elements[0]); } } - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - - auto to_add = lookup_table[static_cast(index)][i]; - to_add.y.self_conditional_negate(sign ^ is_odd); - if (is_odd) { - to_add.x *= beta; - } - temp_point_vector[i] = to_add; + [&](size_t i) { + auto to_add = lookup_table[static_cast(index)][i]; + to_add.y.self_conditional_negate(sign ^ is_odd); + if (is_odd) { + to_add.x *= beta; } + temp_point_vector[i] = to_add; }, - /*finite_field_additions_per_iteration=*/1, - /*finite_field_multiplications_per_iteration=*/is_odd ? 1 : 0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + (is_odd ? thread_heuristics::FF_MULTIPLICATION_COST : 0) + thread_heuristics::FF_COPY_COST + + thread_heuristics::FF_ADDITION_COST); batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the first endo scalar if (wnaf.skew) { - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [&lookup_table, &temp_point_vector](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - - temp_point_vector[i] = -lookup_table[0][i]; - } - }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/0, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + [&](size_t i) { temp_point_vector[i] = -lookup_table[0][i]; }, + thread_heuristics::FF_ADDITION_COST + thread_heuristics::FF_COPY_COST); batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the second endo scalar if (wnaf.endo_skew) { - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - temp_point_vector[i] = lookup_table[0][i]; - temp_point_vector[i].x *= beta; - } + [&](size_t i) { + temp_point_vector[i] = lookup_table[0][i]; + temp_point_vector[i].x *= beta; }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/1, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + thread_heuristics::FF_MULTIPLICATION_COST + thread_heuristics::FF_COPY_COST); batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // handle points at infinity explicitly - run_loop_in_parallel_if_effective( + parallel_for_heuristic( num_points, - [&](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - work_elements[i] = - points[i].is_point_at_infinity() ? work_elements[i].set_infinity() : work_elements[i]; - } + [&](size_t i) { + work_elements[i] = points[i].is_point_at_infinity() ? work_elements[i].set_infinity() : work_elements[i]; }, - /*finite_field_additions_per_iteration=*/0, - /*finite_field_multiplications_per_iteration=*/1, - /*finite_field_inversions_per_iteration=*/0, - /*group_element_additions_per_iteration=*/0, - /*group_element_doublings_per_iteration=*/0, - /*scalar_multiplications_per_iteration=*/0, - /*sequential_copy_ops_per_iteration=*/1); + thread_heuristics::FF_COPY_COST); return work_elements; } diff --git a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp index 5d560639642..72af418b841 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp @@ -144,7 +144,7 @@ class ECCVMCircuitBuilder { msm.resize(msm_sizes[i]); } - run_loop_in_parallel(msm_opqueue_index.size(), [&](size_t start, size_t end) { + parallel_for_threshold(msm_opqueue_index.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { const auto& op = raw_ops[msm_opqueue_index[i]]; auto [msm_index, mul_index] = msm_mul_index[i]; diff --git a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp index 89b3b5a1593..ba55591beda 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp @@ -524,7 +524,7 @@ class ECCVMFlavor { } // compute polynomials for transcript columns - run_loop_in_parallel(transcript_rows.size(), [&](size_t start, size_t end) { + parallel_for_threshold(transcript_rows.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { transcript_accumulator_empty[i] = transcript_rows[i].accumulator_empty; transcript_add[i] = transcript_rows[i].q_add; @@ -575,7 +575,7 @@ class ECCVMFlavor { transcript_accumulator_y[i] = transcript_accumulator_y[i - 1]; } - run_loop_in_parallel(point_table_rows.size(), [&](size_t start, size_t end) { + parallel_for_threshold(point_table_rows.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { // first row is always an empty row (to accommodate shifted polynomials which must have 0 as 1st // coefficient). All other rows in the point_table_rows represent active wnaf gates (i.e. @@ -605,7 +605,7 @@ class ECCVMFlavor { }); // compute polynomials for the msm columns - run_loop_in_parallel(msm_rows.size(), [&](size_t start, size_t end) { + parallel_for_threshold(msm_rows.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { msm_transition[i] = static_cast(msm_rows[i].msm_transition); msm_add[i] = static_cast(msm_rows[i].q_add); diff --git a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp index 006e21b7685..40a160a14ed 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp @@ -341,13 +341,13 @@ class ECCVMMSMMBuilder { } // Normalize the points in the point trace - run_loop_in_parallel(points_to_normalize.size(), [&](size_t start, size_t end) { + parallel_for_threshold(points_to_normalize.size(), [&](size_t start, size_t end) { Element::batch_normalize(&points_to_normalize[start], end - start); }); // inverse_trace is used to compute the value of the `collision_inverse` column in the ECCVM. std::vector inverse_trace(num_point_adds_and_doubles); - run_loop_in_parallel(num_point_adds_and_doubles, [&](size_t start, size_t end) { + parallel_for_threshold(num_point_adds_and_doubles, [&](size_t start, size_t end) { for (size_t operation_idx = start; operation_idx < end; ++operation_idx) { if (operation_trace[operation_idx]) { inverse_trace[operation_idx] = (p1_trace[operation_idx].y + p1_trace[operation_idx].y); diff --git a/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp index c98e1d56b8b..0b10c34a36e 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp @@ -46,7 +46,7 @@ class ECCVMPointTablePrecomputationBuilder { // current impl doesn't work if not 4 static_assert(WNAF_DIGITS_PER_ROW == 4); - run_loop_in_parallel(ecc_muls.size(), [&](size_t start, size_t end) { + parallel_for_threshold(ecc_muls.size(), [&](size_t start, size_t end) { for (size_t j = start; j < end; j++) { const auto& entry = ecc_muls[j]; const auto& slices = entry.wnaf_digits; diff --git a/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp b/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp index bb5d7cf5d3c..afc00706202 100644 --- a/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp +++ b/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp @@ -40,7 +40,7 @@ int main(int argc, char** argv) #ifndef NO_MULTITHREADING std::mutex vector_access_mutex; #endif - run_loop_in_parallel(subgroup_size, [&](size_t start, size_t end) { + parallel_for_threshold(subgroup_size, [&](size_t start, size_t end) { std::vector hash_input; for (size_t point_idx = start; point_idx < end; ++point_idx) { bool rational_point_found = false; diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.hpp index e45c5448b69..51c6fcaf655 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.hpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.hpp @@ -163,7 +163,7 @@ template class ProtoGalaxyProver_ { std::mutex evaluation_mutex; #endif auto linearly_dependent_contribution_accumulator = FF(0); - run_loop_in_parallel(instance_size, [&](size_t start_row, size_t end_row) { + parallel_for_threshold(instance_size, [&](size_t start_row, size_t end_row) { auto thread_accumulator = FF(0); for (size_t row = start_row; row < end_row; row++) { // TODO(https://github.com/AztecProtocol/barretenberg/issues/940): avoid get_row if possible. @@ -220,7 +220,7 @@ template class ProtoGalaxyProver_ { auto prev_level_width = prev_level_coeffs.size(); // we need degree + 1 terms to represent the intermediate polynomials std::vector> level_coeffs(prev_level_width >> 1, std::vector(degree + 1, 0)); - run_loop_in_parallel( + parallel_for_threshold( prev_level_width >> 1, [&](size_t start, size_t end) { for (size_t node = start << 1; node < end << 1; node += 2) { @@ -253,7 +253,7 @@ template class ProtoGalaxyProver_ { { auto width = full_honk_evaluations.size(); std::vector> first_level_coeffs(width >> 1, std::vector(2, 0)); - run_loop_in_parallel(width >> 1, [&](size_t start, size_t end) { + parallel_for_threshold(width >> 1, [&](size_t start, size_t end) { // Run loop in parallel can divide the domain in such way that the indices are odd, which we can't tolerate // here, so first we divide the width by two, enable parallelism and then reconstruct even start and end for (size_t node = start << 1; node < end << 1; node += 2) { diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp index 4199e2cf8ed..3040cd18a45 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp @@ -81,7 +81,7 @@ std::shared_ptr ProtoGalaxyProver_proving_key.polynomials.get_all(); - run_loop_in_parallel(Flavor::NUM_FOLDED_ENTITIES, [&](size_t start_idx, size_t end_idx) { + parallel_for_threshold(Flavor::NUM_FOLDED_ENTITIES, [&](size_t start_idx, size_t end_idx) { for (size_t poly_idx = start_idx; poly_idx < end_idx; poly_idx++) { auto& acc_poly = accumulator_polys[poly_idx]; for (auto& acc_el : acc_poly) { @@ -93,7 +93,7 @@ std::shared_ptr ProtoGalaxyProver_proving_key.polynomials.get_all(); - run_loop_in_parallel(Flavor::NUM_FOLDED_ENTITIES, [&](size_t start_idx, size_t end_idx) { + parallel_for_threshold(Flavor::NUM_FOLDED_ENTITIES, [&](size_t start_idx, size_t end_idx) { for (size_t poly_idx = start_idx; poly_idx < end_idx; poly_idx++) { auto& acc_poly = accumulator_polys[poly_idx]; auto& inst_poly = input_polys[poly_idx]; diff --git a/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp b/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp index 560b208f605..ed3def696da 100644 --- a/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp +++ b/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp @@ -34,7 +34,7 @@ void TranslatorProver::compute_witness(CircuitBuilder& circuit_builder) // Populate the wire polynomials from the wire vectors in the circuit constructor. Note: In goblin translator wires // come as is, since they have to reflect the structure of polynomials in the first 4 wires, which we've commited to for (auto [wire_poly, wire] : zip_view(key->polynomials.get_wires(), circuit_builder.wires)) { - run_loop_in_parallel(circuit_builder.num_gates, [&](size_t start, size_t end) { + parallel_for_threshold(circuit_builder.num_gates, [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { wire_poly[i] = circuit_builder.get_variable(wire[i]); } From fc85c7e1f1180bf7daa9cf21049353f776e2a827 Mon Sep 17 00:00:00 2001 From: ludamad Date: Tue, 20 Aug 2024 02:22:53 +0000 Subject: [PATCH 04/10] grand parallel for refactor --- .../cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 191296a0ed8..8bd2dce7189 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -454,7 +454,7 @@ template class IPA { // Ensure polynomial length cannot be changed from its default specified valued poly_length_var.fix_witness(); - const uint32_t poly_length = static_cast(poly_length_var.get_value()); + const auto poly_length = static_cast(poly_length_var.get_value()); // Step 2. // Receive generator challenge u and compute auxiliary generator @@ -516,7 +516,7 @@ template class IPA { // O(nlogn). This can be optimized to be linear by computing a tree of products. for (size_t i = 0; i < poly_length; i++) { Fr s_vec_scalar = Fr(1); - for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) { + for (size_t j = (log_poly_degree - 1); j != static_cast(-1); j--) { auto bit = (i >> j) & 1; bool b = static_cast(bit); if (b) { From e5ad3bf843f65e85041d658ed9d819b5644da1e8 Mon Sep 17 00:00:00 2001 From: ludamad Date: Tue, 20 Aug 2024 02:32:56 +0000 Subject: [PATCH 05/10] compile fixes --- .../cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp | 4 ++++ barretenberg/cpp/src/barretenberg/common/thread.cpp | 6 +++--- barretenberg/cpp/src/barretenberg/common/thread.hpp | 6 +++--- .../cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp | 2 +- barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp | 6 +++--- barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp | 4 ++-- .../src/barretenberg/eccvm/precomputed_tables_builder.hpp | 2 +- .../src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp | 2 +- .../barretenberg/protogalaxy/protogalaxy_prover_impl.hpp | 6 +++--- .../src/barretenberg/translator_vm/translator_prover.cpp | 2 +- 10 files changed, 22 insertions(+), 18 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 8bd2dce7189..bac4516fa83 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -196,9 +196,11 @@ template class IPA { GroupElement R_i; std::size_t round_size = poly_length; +#ifndef NO_MULTITHREADING // The inner products we'll be computing in parallel need a mutex to be thread-safe during the last // accumulation std::mutex inner_product_accumulation_mutex; +#endif // Step 6. // Perform IPA reduction rounds for (size_t i = 0; i < log_poly_degree; i++) { @@ -218,7 +220,9 @@ template class IPA { } // Update the accumulated results thread-safely { +#ifndef NO_MULTITHREADING std::unique_lock lock(inner_product_accumulation_mutex); +#endif inner_prod_L += current_inner_prod_L; inner_prod_R += current_inner_prod_R; } diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp index 97a278bb360..19eb38e00ab 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.cpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp @@ -99,9 +99,9 @@ void parallel_for(size_t num_iterations, const std::function& func * @param no_multhreading_if_less_or_equal If num points is less or equal to this value, run without parallelization * */ -void parallel_for_threshold(size_t num_points, - const std::function& func, - size_t no_multhreading_if_less_or_equal) +void parallel_for_range(size_t num_points, + const std::function& func, + size_t no_multhreading_if_less_or_equal) { if (num_points <= no_multhreading_if_less_or_equal) { func(0, num_points); diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp b/barretenberg/cpp/src/barretenberg/common/thread.hpp index ab85f394674..7227e139d00 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.hpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.hpp @@ -28,9 +28,9 @@ inline size_t get_num_cpus_pow2() * The size will be chosen based on the hardware concurrency (i.e., env or cpus). */ void parallel_for(size_t num_iterations, const std::function& func); -void parallel_for_threshold(size_t num_points, - const std::function& func, - size_t no_multhreading_if_less_or_equal = 0); +void parallel_for_range(size_t num_points, + const std::function& func, + size_t no_multhreading_if_less_or_equal = 0); /** * @brief Split a loop into several loops running in parallel based on operations in 1 iteration diff --git a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp index 72af418b841..6121469064a 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp @@ -144,7 +144,7 @@ class ECCVMCircuitBuilder { msm.resize(msm_sizes[i]); } - parallel_for_threshold(msm_opqueue_index.size(), [&](size_t start, size_t end) { + parallel_for_range(msm_opqueue_index.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { const auto& op = raw_ops[msm_opqueue_index[i]]; auto [msm_index, mul_index] = msm_mul_index[i]; diff --git a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp index ba55591beda..6affa954598 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp @@ -524,7 +524,7 @@ class ECCVMFlavor { } // compute polynomials for transcript columns - parallel_for_threshold(transcript_rows.size(), [&](size_t start, size_t end) { + parallel_for_range(transcript_rows.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { transcript_accumulator_empty[i] = transcript_rows[i].accumulator_empty; transcript_add[i] = transcript_rows[i].q_add; @@ -575,7 +575,7 @@ class ECCVMFlavor { transcript_accumulator_y[i] = transcript_accumulator_y[i - 1]; } - parallel_for_threshold(point_table_rows.size(), [&](size_t start, size_t end) { + parallel_for_range(point_table_rows.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { // first row is always an empty row (to accommodate shifted polynomials which must have 0 as 1st // coefficient). All other rows in the point_table_rows represent active wnaf gates (i.e. @@ -605,7 +605,7 @@ class ECCVMFlavor { }); // compute polynomials for the msm columns - parallel_for_threshold(msm_rows.size(), [&](size_t start, size_t end) { + parallel_for_range(msm_rows.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { msm_transition[i] = static_cast(msm_rows[i].msm_transition); msm_add[i] = static_cast(msm_rows[i].q_add); diff --git a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp index 40a160a14ed..d0bb9459ad2 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp @@ -341,13 +341,13 @@ class ECCVMMSMMBuilder { } // Normalize the points in the point trace - parallel_for_threshold(points_to_normalize.size(), [&](size_t start, size_t end) { + parallel_for_range(points_to_normalize.size(), [&](size_t start, size_t end) { Element::batch_normalize(&points_to_normalize[start], end - start); }); // inverse_trace is used to compute the value of the `collision_inverse` column in the ECCVM. std::vector inverse_trace(num_point_adds_and_doubles); - parallel_for_threshold(num_point_adds_and_doubles, [&](size_t start, size_t end) { + parallel_for_range(num_point_adds_and_doubles, [&](size_t start, size_t end) { for (size_t operation_idx = start; operation_idx < end; ++operation_idx) { if (operation_trace[operation_idx]) { inverse_trace[operation_idx] = (p1_trace[operation_idx].y + p1_trace[operation_idx].y); diff --git a/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp index 0b10c34a36e..614eea69d34 100644 --- a/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp @@ -46,7 +46,7 @@ class ECCVMPointTablePrecomputationBuilder { // current impl doesn't work if not 4 static_assert(WNAF_DIGITS_PER_ROW == 4); - parallel_for_threshold(ecc_muls.size(), [&](size_t start, size_t end) { + parallel_for_range(ecc_muls.size(), [&](size_t start, size_t end) { for (size_t j = start; j < end; j++) { const auto& entry = ecc_muls[j]; const auto& slices = entry.wnaf_digits; diff --git a/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp b/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp index afc00706202..f49b675c45b 100644 --- a/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp +++ b/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp @@ -40,7 +40,7 @@ int main(int argc, char** argv) #ifndef NO_MULTITHREADING std::mutex vector_access_mutex; #endif - parallel_for_threshold(subgroup_size, [&](size_t start, size_t end) { + parallel_for_range(subgroup_size, [&](size_t start, size_t end) { std::vector hash_input; for (size_t point_idx = start; point_idx < end; ++point_idx) { bool rational_point_found = false; diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp index 17d50d6238f..37fa0c619e3 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp @@ -17,7 +17,7 @@ std::vector::FF> ProtoGalaxyProver std::mutex evaluation_mutex; #endif auto linearly_dependent_contribution_accumulator = FF(0); - run_loop_in_parallel(instance_size, [&](size_t start_row, size_t end_row) { + parallel_for_range(instance_size, [&](size_t start_row, size_t end_row) { auto thread_accumulator = FF(0); for (size_t row = start_row; row < end_row; row++) { auto row_evaluations = instance_polynomials.get_row(row); @@ -60,7 +60,7 @@ std::vector::FF> ProtoGalaxyProver auto degree = level + 1; auto prev_level_width = prev_level_coeffs.size(); std::vector> level_coeffs(prev_level_width >> 1, std::vector(degree + 1, 0)); - run_loop_in_parallel( + parallel_for_range( prev_level_width >> 1, [&](size_t start, size_t end) { for (size_t node = start << 1; node < end << 1; node += 2) { @@ -85,7 +85,7 @@ std::vector::FF> ProtoGalaxyProver { auto width = full_honk_evaluations.size(); std::vector> first_level_coeffs(width >> 1, std::vector(2, 0)); - run_loop_in_parallel(width >> 1, [&](size_t start, size_t end) { + parallel_for_range(width >> 1, [&](size_t start, size_t end) { for (size_t node = start << 1; node < end << 1; node += 2) { auto parent = node >> 1; first_level_coeffs[parent][0] = full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0]; diff --git a/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp b/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp index ed3def696da..2bc903a4592 100644 --- a/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp +++ b/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp @@ -34,7 +34,7 @@ void TranslatorProver::compute_witness(CircuitBuilder& circuit_builder) // Populate the wire polynomials from the wire vectors in the circuit constructor. Note: In goblin translator wires // come as is, since they have to reflect the structure of polynomials in the first 4 wires, which we've commited to for (auto [wire_poly, wire] : zip_view(key->polynomials.get_wires(), circuit_builder.wires)) { - parallel_for_threshold(circuit_builder.num_gates, [&](size_t start, size_t end) { + parallel_for_range(circuit_builder.num_gates, [&](size_t start, size_t end) { for (size_t i = start; i < end; i++) { wire_poly[i] = circuit_builder.get_variable(wire[i]); } From 5ea718c7ee64db4b035eb1275cb142a71ca6b330 Mon Sep 17 00:00:00 2001 From: ludamad Date: Tue, 20 Aug 2024 12:23:28 +0000 Subject: [PATCH 06/10] start of accumulator cleanup --- .../barretenberg/common/compiler_hints.hpp | 2 ++ .../cpp/src/barretenberg/common/container.hpp | 17 +++++++++++- .../cpp/src/barretenberg/common/thread.hpp | 17 ++++++++++++ .../protogalaxy/protogalaxy_prover_impl.hpp | 26 ++++++------------- 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp index 2ff5feb85b1..23898a7d4ee 100644 --- a/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp +++ b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp @@ -25,4 +25,6 @@ #define BB_UNLIKELY(x) x #endif +// Opinionated feature: functionally equivalent to [[maybe_unused]] but clearly +// marks things DEFINITELY unused. Aims to be more readable, at the tradeoff of being a custom thingy. #define BB_UNUSED [[maybe_unused]] \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/common/container.hpp b/barretenberg/cpp/src/barretenberg/common/container.hpp index be010ee5d00..db609cec674 100644 --- a/barretenberg/cpp/src/barretenberg/common/container.hpp +++ b/barretenberg/cpp/src/barretenberg/common/container.hpp @@ -4,6 +4,8 @@ #include #include +namespace bb { + template C slice(C const& container, size_t start) { auto b = container.begin(); @@ -61,4 +63,17 @@ template int64_t index_of(std::vector const& vec, T const& item) auto const& itr = std::find(begin, end, item); return itr == end ? -1 : std::distance(begin, itr); -} \ No newline at end of file +} + +// A simple sum meant for small containers (i.e. doesn't use threading) +template