refactor(bb): IPA parallelization cleanup #8088

Merged (12 commits) on Aug 20, 2024
56 changes: 18 additions & 38 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -2,12 +2,14 @@
#include "barretenberg/commitment_schemes/claim.hpp"
#include "barretenberg/commitment_schemes/verification_key.hpp"
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/container.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
#include "barretenberg/transcript/transcript.hpp"
#include <cstddef>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

namespace bb {
@@ -168,10 +170,8 @@ template <typename Curve_> class IPA {
// G_vec_local should use only the original SRS thus we extract only the even indices.
parallel_for_heuristic(
poly_length,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
[&](size_t i) {
G_vec_local[i] = srs_elements[i * 2];
Review comment (Contributor): lol

}, thread_heuristics::FF_COPY_COST);

// Step 5.
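
A side note on the pattern above: the chunked lambda (start / end / chunk_index) is replaced by a per-index lambda, with the even-index stride folded into srs_elements[i * 2]. For readers unfamiliar with the helper, here is a minimal, self-contained sketch of a per-index parallel loop with the same calling shape; it is illustrative only, not barretenberg's parallel_for_heuristic (which additionally weighs heuristic_cost when deciding how to split the work).

#include <algorithm>
#include <cstddef>
#include <thread>
#include <vector>

// Illustration only: a bare-bones per-index parallel loop.
template <typename Func> void parallel_for_index(std::size_t n, const Func& func)
{
    const std::size_t num_threads = std::max<std::size_t>(1, std::thread::hardware_concurrency());
    const std::size_t chunk_size = (n + num_threads - 1) / num_threads;
    std::vector<std::thread> workers;
    for (std::size_t t = 0; t < num_threads; t++) {
        const std::size_t start = t * chunk_size;
        const std::size_t end = std::min(n, start + chunk_size);
        if (start >= end) {
            break;
        }
        workers.emplace_back([&func, start, end] {
            for (std::size_t i = start; i < end; i++) {
                func(i); // the caller only ever sees a single logical index
            }
        });
    }
    for (auto& worker : workers) {
        worker.join();
    }
}

// Hypothetical usage mirroring the hunk above (G_vec_local / srs_elements as in the diff):
// parallel_for_index(poly_length, [&](std::size_t i) { G_vec_local[i] = srs_elements[i * 2]; });
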
@@ -196,38 +196,22 @@ template <typename Curve_> class IPA {
GroupElement R_i;
std::size_t round_size = poly_length;

#ifndef NO_MULTITHREADING
// The inner products we'll be computing in parallel need a mutex to be thread-safe during the last
// accumulation
std::mutex inner_product_accumulation_mutex;
#endif
// Step 6.
// Perform IPA reduction rounds
for (size_t i = 0; i < log_poly_degree; i++) {
round_size >>= 1;
// Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
round_size /= 2;
// Run scalar products in parallel
parallel_for_heuristic(
auto inner_prods = parallel_for_heuristic(
round_size,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
// Update the accumulated results thread-safely
{
#ifndef NO_MULTITHREADING
std::unique_lock<std::mutex> lock(inner_product_accumulation_mutex);
#endif
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
}
std::pair{Fr::zero(), Fr::zero()},
[&](size_t j, std::pair<Fr, Fr>& inner_prod_left_right) {
// Compute inner_prod_L := < a_vec_lo, b_vec_hi >
inner_prod_left_right.first += a_vec[j] * b_vec[round_size + j];
// Compute inner_prod_R := < a_vec_hi, b_vec_lo >
inner_prod_left_right.second += a_vec[round_size + j] * b_vec[j];
}, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);

// Sum inner product contributions computed in parallel and unpack the std::pair
auto [inner_prod_L, inner_prod_R] = sum_pairs(inner_prods);
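
To see the new pattern in isolation: the initializer (here a std::pair of zeros) is copied once per thread/chunk, each chunk mutates only its own copy, and sum_pairs() folds the per-chunk partial sums afterwards, so the mutex from the old code is no longer needed. Below is a usage sketch on plain integers, assuming the helpers added later in this PR (the accumulator overload of parallel_for_heuristic in thread.hpp, sum_pairs in container.hpp, and the ALWAYS_MULTITHREAD constant) are visible through the includes shown; the function itself is made up for illustration.

#include "barretenberg/common/container.hpp"
#include "barretenberg/common/thread.hpp"
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Sketch: compute <a_lo, b_hi> and <a_hi, b_lo> without a mutex, mirroring the hunk above.
std::pair<uint64_t, uint64_t> cross_inner_products(const std::vector<uint64_t>& a,
                                                   const std::vector<uint64_t>& b,
                                                   size_t round_size)
{
    auto partial_pairs = bb::parallel_for_heuristic(
        round_size,
        std::pair{ uint64_t(0), uint64_t(0) }, // per-thread/chunk accumulator initializer
        [&](size_t j, std::pair<uint64_t, uint64_t>& acc) {
            acc.first += a[j] * b[round_size + j];  // contribution to < a_lo, b_hi >
            acc.second += a[round_size + j] * b[j]; // contribution to < a_hi, b_lo >
        },
        bb::thread_heuristics::ALWAYS_MULTITHREAD);
    // Fold the per-chunk pairs into the two final inner products.
    return bb::sum_pairs(partial_pairs);
}
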
// Step 6.a (using letters, because doxygen automatically converts the sublist counters to letters :( )
// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
L_i = bb::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
@@ -376,23 +360,21 @@ template <typename Curve_> class IPA {

// Step 7.
// Construct vector s
std::vector<Fr> s_vec(poly_length);
std::vector<Fr> s_vec(poly_length, Fr::one());

// TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as it's
// O(n log n). This can be optimized to be linear by computing a tree of products. It's very readable, so we're
// leaving it unoptimized for now.
parallel_for_heuristic(
poly_length,
[&](size_t i) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
s_vec[i] *= round_challenges_inv[log_poly_degree - 1 - j];
}
}
s_vec[i] = s_vec_scalar;
}, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree);
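
On the TODO above (issue 857): the loop computes s_vec[i] as the product of round_challenges_inv[log_poly_degree - 1 - j] over the set bits j of i, which costs O(n log n) field multiplications. A rough sketch of the linear-time alternative the TODO alludes to (building s by repeated doubling); this is a hypothetical helper, not code from this PR:

#include <cstddef>
#include <vector>

// Sketch only: linear-time construction of the s vector by doubling.
// T is any type with operator* (e.g. a field element); `one` is its multiplicative identity.
template <typename T>
std::vector<T> build_s_vec_linear(const std::vector<T>& round_challenges_inv, const T& one)
{
    const std::size_t log_n = round_challenges_inv.size();
    std::vector<T> s{ one };
    s.reserve(std::size_t(1) << log_n);
    for (std::size_t j = 0; j < log_n; j++) {
        const std::size_t half = s.size(); // == 2^j entries already built
        s.resize(2 * half, one);
        // Indices in [half, 2*half) have bit j set, so they pick up exactly one extra factor.
        for (std::size_t i = 0; i < half; i++) {
            s[half + i] = s[i] * round_challenges_inv[log_n - 1 - j];
        }
    }
    return s; // 2^log_n entries built from (2^log_n - 1) multiplications
}
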

auto* srs_elements = vk->get_monomial_points();
@@ -405,10 +387,8 @@
// G_vec_local should use only the original SRS thus we extract only the even indices.
parallel_for_heuristic(
poly_length,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
[&](size_t i) {
G_vec_local[i] = srs_elements[i * 2];
}, thread_heuristics::FF_COPY_COST * 2);

// Step 8.
2 changes: 2 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
@@ -25,4 +25,6 @@
#define BB_UNLIKELY(x) x
#endif

// Opinionated feature: functionally equivalent to [[maybe_unused]] but clearly
// marks things DEFINITELY unused. Aims to be more readable, at the tradeoff of being a custom thingy.
Review comment (Contributor): thx

#define BB_UNUSED [[maybe_unused]]
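
For context, this macro is what the chunked lambdas elsewhere in this PR use (e.g. BB_UNUSED size_t chunk_index in the pre-refactor IPA code above). A tiny, self-contained illustration of the intent; the function below is made up for the example:

#include <cstddef>

#define BB_UNUSED [[maybe_unused]]

// A callback that must accept a chunk index to satisfy an interface but never reads it;
// BB_UNUSED states that the parameter is deliberately ignored rather than forgotten.
void copy_chunk(const int* src, int* dst, std::size_t start, std::size_t end, BB_UNUSED std::size_t chunk_index)
{
    for (std::size_t i = start; i < end; i++) {
        dst[i] = src[i];
    }
}
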
29 changes: 28 additions & 1 deletion barretenberg/cpp/src/barretenberg/common/container.hpp
@@ -4,6 +4,8 @@
#include <string>
#include <vector>

namespace bb {

template <typename C> C slice(C const& container, size_t start)
{
auto b = container.begin();
@@ -61,4 +63,29 @@ template <typename T> int64_t index_of(std::vector<T> const& vec, T const& item)
auto const& itr = std::find(begin, end, item);

return itr == end ? -1 : std::distance(begin, itr);
}
}

// A simple sum meant for small containers (i.e. doesn't use threading)
template <template <typename, typename...> typename Cont, typename Inner, typename... Args>
Inner sum(Cont<Inner, Args...> const& in)
{
Inner result{};
for (auto& e : in) {
result += e;
}
return result;
}

// A simple sum meant for small containers (i.e. doesn't use threading)
template <template <typename, typename...> typename Cont, typename Left, typename Right, typename... Args>
std::pair<Left, Right> sum_pairs(Cont<std::pair<Left, Right>, Args...> const& in)
{
std::pair<Left, Right> result{ {}, {} };
for (auto& e : in) {
result.first += e.first;
result.second += e.second;
}
return result;
}

} // namespace bb
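
A quick usage sketch of the two helpers just added; the values are arbitrary and the include path is the one used elsewhere in this diff:

#include "barretenberg/common/container.hpp"
#include <cstdint>
#include <utility>
#include <vector>

int main()
{
    std::vector<uint64_t> totals{ 1, 2, 3 };
    uint64_t total = bb::sum(totals); // 6

    std::vector<std::pair<uint64_t, uint64_t>> partials{ { 1, 10 }, { 2, 20 } };
    auto [left, right] = bb::sum_pairs(partials); // left == 3, right == 30

    return static_cast<int>(total + left + right) == 39 ? 0 : 1;
}
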
28 changes: 28 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -60,6 +60,32 @@ void parallel_for_heuristic(size_t num_points, const Func& func, size_t heuristi
heuristic_cost);
}

/**
* @brief parallel_for_heuristic variant that takes an accumulator initializer
* that is allocated in a vector, one accumulator per thread/chunk.
* This allows for thread-safe accumulation, see sum() or sum_pairs() in container.hpp
* for an easy way to combine the thread/chunk contributions into a final result.
*/
template <typename Func, typename Accum>
requires std::invocable<Func, std::size_t, Accum&>
std::vector<Accum> parallel_for_heuristic(size_t num_points,
const Accum& initial_accum,
const Func& func,
size_t heuristic_cost)
{
// thread-safe accumulators
std::vector<Accum> accumulators(get_num_cpus(), initial_accum);
parallel_for_heuristic(
num_points,
[&](size_t start_idx, size_t end_idx, size_t chunk_index) {
for (size_t i = start_idx; i < end_idx; i++) {
func(i, accumulators[chunk_index]);
}
},
heuristic_cost);
return accumulators;
}
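
A usage sketch of this overload outside the IPA and ProtoGalaxy call sites: each chunk writes only to accumulators[chunk_index], so no locking is needed, and the caller reduces the returned vector however it likes, for instance with sum() from container.hpp. The function and data below are made up for illustration:

#include "barretenberg/common/container.hpp"
#include "barretenberg/common/thread.hpp"
#include <cstddef>
#include <cstdint>
#include <vector>

// Count even entries in parallel; the per-chunk tallies are combined at the end.
std::size_t count_even(const std::vector<uint64_t>& values)
{
    std::vector<std::size_t> tallies = bb::parallel_for_heuristic(
        values.size(),
        /*initial_accum=*/std::size_t(0),
        [&](std::size_t i, std::size_t& tally) {
            if (values[i] % 2 == 0) {
                tally++; // touches only this chunk's accumulator, hence thread-safe
            }
        },
        bb::thread_heuristics::ALWAYS_MULTITHREAD);
    return bb::sum(tallies);
}
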

const size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;

/**
@@ -100,6 +126,8 @@ constexpr size_t GE_DOUBLING_COST = 194;
constexpr size_t SM_COST = 50000;
// Field element (16 byte) sequential copy number
constexpr size_t FF_COPY_COST = 3;
// Fine default if something looks 'chunky enough that I don't want to calculate'
constexpr size_t ALWAYS_MULTITHREAD = 100000;
} // namespace thread_heuristics

} // namespace bb
@@ -1,4 +1,7 @@
#pragma once
#include "barretenberg/common/container.hpp"
#include "barretenberg/common/op_count.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/flavor/flavor.hpp"
#include "barretenberg/ultra_honk/oink_prover.hpp"
#include "protogalaxy_prover.hpp"
@@ -10,16 +13,13 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver
const RelationSeparator& alpha,
const RelationParameters<FF>& relation_parameters)
{
BB_OP_COUNT_TIME_NAME("ProtoGalaxyProver_::compute_full_honk_evaluations");
auto instance_size = instance_polynomials.get_polynomial_size();
std::vector<FF> full_honk_evaluations(instance_size);
std::vector<FF> linearly_dependent_contributions(instance_size);
#ifndef NO_MULTITHREADING
std::mutex evaluation_mutex;
#endif
auto linearly_dependent_contribution_accumulator = FF(0);
parallel_for_range(instance_size, [&](size_t start_row, size_t end_row) {
auto thread_accumulator = FF(0);
for (size_t row = start_row; row < end_row; row++) {
std::vector<FF> linearly_dependent_contribution_accumulators = parallel_for_heuristic(
instance_size,
/*accumulator default*/ FF(0),
[&](size_t row, FF& linearly_dependent_contribution_accumulator) {
auto row_evaluations = instance_polynomials.get_row(row);
RelationEvaluations relation_evaluations;
Utils::zero_elements(relation_evaluations);
@@ -29,19 +29,13 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver

auto output = FF(0);
auto running_challenge = FF(1);
auto linearly_dependent_contribution = FF(0);
Utils::scale_and_batch_elements(
relation_evaluations, alpha, running_challenge, output, linearly_dependent_contribution);
thread_accumulator += linearly_dependent_contribution;
relation_evaluations, alpha, running_challenge, output, linearly_dependent_contribution_accumulator);

full_honk_evaluations[row] = output;
}
#ifndef NO_MULTITHREADING
std::unique_lock<std::mutex> evaluation_lock(evaluation_mutex);
#endif
linearly_dependent_contribution_accumulator += thread_accumulator;
});
full_honk_evaluations[0] += linearly_dependent_contribution_accumulator;
},
thread_heuristics::ALWAYS_MULTITHREAD);
full_honk_evaluations[0] += sum(linearly_dependent_contribution_accumulators);
return full_honk_evaluations;
}

@@ -59,20 +53,18 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver

auto degree = level + 1;
auto prev_level_width = prev_level_coeffs.size();
std::vector<std::vector<FF>> level_coeffs(prev_level_width >> 1, std::vector<FF>(degree + 1, 0));
parallel_for_range(
prev_level_width >> 1,
[&](size_t start, size_t end) {
for (size_t node = start << 1; node < end << 1; node += 2) {
auto parent = node >> 1;
std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
for (size_t d = 0; d < degree; d++) {
level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
}
std::vector<std::vector<FF>> level_coeffs(prev_level_width / 2, std::vector<FF>(degree + 1, 0));
parallel_for_heuristic(
prev_level_width / 2,
[&](size_t parent) {
size_t node = parent * 2;
std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
for (size_t d = 0; d < degree; d++) {
level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
}
},
/*no_multhreading_if_less_or_equal=*/8);
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3);
return construct_coefficients_tree(betas, deltas, level_coeffs, level + 1);
}
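
To make the loop above easier to follow: treating each prev_level_coeffs entry as a polynomial in an indeterminate X, every parent is built from its two children as

    level_coeffs[parent](X) = prev_level_coeffs[2 * parent](X) + (betas[level] + deltas[level] * X) * prev_level_coeffs[2 * parent + 1](X)

which is why each node gains one more coefficient per level. The first-level hunk further down in this diff is the degree-0 base case of the same recurrence.
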

@@ -84,14 +76,15 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver
const std::vector<FF>& full_honk_evaluations)
{
auto width = full_honk_evaluations.size();
std::vector<std::vector<FF>> first_level_coeffs(width >> 1, std::vector<FF>(2, 0));
parallel_for_range(width >> 1, [&](size_t start, size_t end) {
for (size_t node = start << 1; node < end << 1; node += 2) {
auto parent = node >> 1;
std::vector<std::vector<FF>> first_level_coeffs(width / 2, std::vector<FF>(2, 0));
parallel_for_heuristic(
width / 2,
[&](size_t parent) {
size_t node = parent * 2;
first_level_coeffs[parent][0] = full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0];
first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0];
}
});
},
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3);
return construct_coefficients_tree(betas, deltas, first_level_coeffs);
}
