refactor(bb): simplify parallel_for_if_effective (#8079)

The chunkiness of `run_loop_in_parallel_if_effective` was getting in the way of doing polynomial refactor passes for structured polynomials.

- Renamed the parallel-for variants so that they all start with `parallel_for`.
- Doing the cost math on the spot with constants was simpler than passing per-operation counts, plus other simplifications.
- Removed the variant whose callback takes two parameters (`start`, `end`), and introduced a one-parameter variant, as having just an `i` parameter felt like the sweet spot. In a lot of places the thread/chunk index is not used yet; I plan to use it in a follow-up, as we can avoid mutexes that way and be more efficient.

Bundled changes:

- Removed an unnecessary polynomial copy in IPA.
- Introduced a `SlabVector` alias (a `std::vector` using the slab allocator) where we were using `ContainerSlabAllocator` directly, with plans to use it more (and possibly make it equal to a normal `std::vector` for native code, where memory fragmentation really doesn't matter).
AztecProtocol · Aug 20, 2024 · 5bff26b · 5bff26b · AztecBot · Aug 20, 2024
1 parent f668907
commit 5bff26b
Show file tree

Hide file tree

Showing 20 changed files with 189 additions and 400 deletions.
diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -2,6 +2,7 @@
 #include "barretenberg/commitment_schemes/claim.hpp"
 #include "barretenberg/commitment_schemes/verification_key.hpp"
 #include "barretenberg/common/assert.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
 #include "barretenberg/transcript/transcript.hpp"
 #include <cstddef>
@@ -90,8 +91,6 @@ template <typename Curve_> class IPA {
 #ifdef IPA_FUZZ_TEST
    friend class ProxyCaller;
 #endif
-   // clang-format off
-
    /**
     * @brief Compute an inner product argument proof for opening a single polynomial at a single evaluation point.
     *
@@ -128,16 +127,14 @@ template <typename Curve_> class IPA {
     *
     *7. Send the final \f$\vec{a}_{0} = (a_0)\f$ to the verifier
     */
-   template <typename Transcript>
-   static void compute_opening_proof_internal(const std::shared_ptr<CK>& ck,
-                                              const ProverOpeningClaim<Curve>& opening_claim,
-                                              const std::shared_ptr<Transcript>& transcript)
-   {
-
-        Polynomial polynomial = opening_claim.polynomial;
+    template <typename Transcript>
+    static void compute_opening_proof_internal(const std::shared_ptr<CK>& ck,
+                                               const ProverOpeningClaim<Curve>& opening_claim,
+                                               const std::shared_ptr<Transcript>& transcript)
+    {
+        const Polynomial& polynomial = opening_claim.polynomial;
 
-        // clang-format on
-        auto poly_length = static_cast<size_t>(polynomial.size());
+        size_t poly_length = polynomial.size();
 
         // Step 1.
         // Send polynomial degree + 1 = d to the verifier
@@ -169,36 +166,27 @@ template <typename Curve_> class IPA {
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
             poly_length,
-            [&G_vec_local, srs_elements](size_t start, size_t end) {
+            [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                 for (size_t i = start * 2; i < end * 2; i += 2) {
                     G_vec_local[i >> 1] = srs_elements[i];
                 }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            }, thread_heuristics::FF_COPY_COST);
 
         // Step 5.
         // Compute vector b (vector of the powers of the challenge)
         OpeningPair<Curve> opening_pair = opening_claim.opening_pair;
         std::vector<Fr> b_vec(poly_length);
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
             poly_length,
-            [&b_vec, &opening_pair](size_t start, size_t end) {
+            [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                 Fr b_power = opening_pair.challenge.pow(start);
                 for (size_t i = start; i < end; i++) {
                     b_vec[i] = b_power;
                     b_power *= opening_pair.challenge;
                 }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/1);
+            }, thread_heuristics::FF_COPY_COST + thread_heuristics::FF_MULTIPLICATION_COST);
 
         // Iterate for log(poly_degree) rounds to compute the round commitments.
         auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_length));
@@ -221,18 +209,9 @@ template <typename Curve_> class IPA {
             Fr inner_prod_L = Fr::zero();
             Fr inner_prod_R = Fr::zero();
             // Run scalar products in parallel
-            run_loop_in_parallel_if_effective(
+            parallel_for_heuristic(
                 round_size,
-                [&a_vec,
-                 &b_vec,
-                 round_size,
-                 &inner_prod_L,
-                 &inner_prod_R
-#ifndef NO_MULTITHREADING
-                 ,
-                 &inner_product_accumulation_mutex
-#endif
-            ](size_t start, size_t end) {
+                [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                     Fr current_inner_prod_L = Fr::zero();
                     Fr current_inner_prod_R = Fr::zero();
                     for (size_t j = start; j < end; j++) {
@@ -247,9 +226,7 @@ template <typename Curve_> class IPA {
                         inner_prod_L += current_inner_prod_L;
                         inner_prod_R += current_inner_prod_R;
                     }
-                },
-                /*finite_field_additions_per_iteration=*/2,
-                /*finite_field_multiplications_per_iteration=*/2);
+                }, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);
 
             // Step 6.a (using letters, because doxygen automatically converts the sublist counters to letters :( )
             // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
@@ -281,29 +258,24 @@ template <typename Curve_> class IPA {
             // Step 6.e
             // G_vec_new = G_vec_lo + G_vec_hi * round_challenge_inv
             auto G_hi_by_inverse_challenge = GroupElement::batch_mul_with_endomorphism(
-                std::span{ G_vec_local.begin() + static_cast<long>(round_size),
-                           G_vec_local.begin() + static_cast<long>(round_size * 2) },
+                std::span{ G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size),
+                           G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size * 2) },
                 round_challenge_inv);
             GroupElement::batch_affine_add(
-                std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
+                std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size) },
                 G_hi_by_inverse_challenge,
                 G_vec_local);
 
             // Steps 6.e and 6.f
             // Update the vectors a_vec, b_vec.
             // a_vec_new = a_vec_lo + a_vec_hi * round_challenge
             // b_vec_new = b_vec_lo + b_vec_hi * round_challenge_inv
-            run_loop_in_parallel_if_effective(
+            parallel_for_heuristic(
                 round_size,
-                [&a_vec, &b_vec, round_challenge, round_challenge_inv, round_size](size_t start, size_t end) {
-                    for (size_t j = start; j < end; j++) {
-                        a_vec[j] += round_challenge * a_vec[round_size + j];
-                        b_vec[j] += round_challenge_inv * b_vec[round_size + j];
-                    }
-                },
-                /*finite_field_additions_per_iteration=*/4,
-                /*finite_field_multiplications_per_iteration=*/8,
-                /*finite_field_inversions_per_iteration=*/1);
+                [&](size_t j) {
+                    a_vec[j] += round_challenge * a_vec[round_size + j];
+                    b_vec[j] += round_challenge_inv * b_vec[round_size + j];
+                }, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);
         }
 
         // Step 7
@@ -409,23 +381,19 @@ template <typename Curve_> class IPA {
         // TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as it's
         // O(nlogn). This can be optimized to be linear by computing a tree of products. It's very readable, so we're
         // leaving it unoptimized for now.
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
             poly_length,
-            [&s_vec, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
-                for (size_t i = start; i < end; i++) {
-                    Fr s_vec_scalar = Fr::one();
-                    for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
-                        auto bit = (i >> j) & 1;
-                        bool b = static_cast<bool>(bit);
-                        if (b) {
-                            s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
-                        }
+            [&](size_t i) {
+                Fr s_vec_scalar = Fr::one();
+                for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
+                    auto bit = (i >> j) & 1;
+                    bool b = static_cast<bool>(bit);
+                    if (b) {
+                        s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
                     }
-                    s_vec[i] = s_vec_scalar;
                 }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/log_poly_degree);
+                s_vec[i] = s_vec_scalar;
+            }, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree);
 
         auto* srs_elements = vk->get_monomial_points();
 
@@ -435,20 +403,13 @@ template <typename Curve_> class IPA {
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
             poly_length,
-            [&G_vec_local, srs_elements](size_t start, size_t end) {
+            [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                 for (size_t i = start * 2; i < end * 2; i += 2) {
                     G_vec_local[i >> 1] = srs_elements[i];
                 }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            }, thread_heuristics::FF_COPY_COST * 2);
 
         // Step 8.
         // Compute G₀
@@ -497,7 +458,7 @@ template <typename Curve_> class IPA {
         // Ensure polynomial length cannot be changed from its default specified valued
         poly_length_var.fix_witness();
 
-        const uint32_t poly_length = static_cast<uint32_t>(poly_length_var.get_value());
+        const auto poly_length = static_cast<uint32_t>(poly_length_var.get_value());
 
         // Step 2.
         // Receive generator challenge u and compute auxiliary generator
@@ -559,7 +520,7 @@ template <typename Curve_> class IPA {
         // O(nlogn). This can be optimized to be linear by computing a tree of products.
         for (size_t i = 0; i < poly_length; i++) {
             Fr s_vec_scalar = Fr(1);
-            for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
+            for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
                 auto bit = (i >> j) & 1;
                 bool b = static_cast<bool>(bit);
                 if (b) {

diff --git a/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
@@ -23,4 +23,6 @@
 #else
 #define BB_LIKELY(x) x
 #define BB_UNLIKELY(x) x
-#endif
+#endif
+
+#define BB_UNUSED [[maybe_unused]]
diff --git a/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp b/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp
@@ -5,6 +5,7 @@
 #include <map>
 #include <memory>
 #include <unordered_map>
+#include <vector>
 #ifndef NO_MULTITHREADING
 #include <mutex>
 #endif
@@ -75,4 +76,9 @@ template <typename T> class ContainerSlabAllocator {
     }
 };
 
+/**
+ * @brief A vector that uses the slab allocator.
+ */
+template <typename T> using SlabVector = std::vector<T, bb::ContainerSlabAllocator<T>>;
+
 } // namespace bb
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -99,9 +99,9 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
  * @param no_multhreading_if_less_or_equal If num points is less or equal to this value, run without parallelization
  *
  */
-void run_loop_in_parallel(size_t num_points,
-                          const std::function<void(size_t, size_t)>& func,
-                          size_t no_multhreading_if_less_or_equal)
+void parallel_for_range(size_t num_points,
+                        const std::function<void(size_t, size_t)>& func,
+                        size_t no_multhreading_if_less_or_equal)
 {
     if (num_points <= no_multhreading_if_less_or_equal) {
         func(0, num_points);
@@ -129,45 +129,10 @@ void run_loop_in_parallel(size_t num_points,
     });
 };
 
-/**
- * @brief Split a loop into several loops running in parallel based on operations in 1 iteration
- *
- * @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function
- * that should contain the work loop, but only if it's worth it
- * @param num_points Total number of elements
- * @param func A function or lambda expression with a for loop inside, for example:
- * [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
- * Or for the version with index:
- * [](size_t start, size_t end,size_t workload_index){for (size_t i=start; i<end; i++){(void)i;}}
- * @param finite_field_additions_per_iteration The number of additions/subtractions/negations
- * @param finite_field_multiplications_per_iteration The number of finite field multiplications and squarings
- * @param finite_field_inversions_per_iteration
- * @param group_element_additions_per_iteration Projective addition number
- * @param group_element_doublings_per_iteration Projective doubling number
- * @param scalar_multiplications_per_iteration
- * @param sequential_copy_ops_per_iteration Field element (16 byte) sequential copy number
- */
-template <typename FunctionType>
-    requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
-             std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
-void run_loop_in_parallel_if_effective_internal(size_t num_points,
-                                                const FunctionType& func,
-                                                size_t finite_field_additions_per_iteration,
-                                                size_t finite_field_multiplications_per_iteration,
-                                                size_t finite_field_inversions_per_iteration,
-                                                size_t group_element_additions_per_iteration,
-                                                size_t group_element_doublings_per_iteration,
-                                                size_t scalar_multiplications_per_iteration,
-                                                size_t sequential_copy_ops_per_iteration)
+void parallel_for_heuristic(size_t num_points,
+                            const std::function<void(size_t, size_t, size_t)>& func,
+                            size_t heuristic_cost)
 {
-    // Rough cost of operations (the operation costs are derives in basics_bench and the units are nanoseconds):
-    constexpr size_t FF_ADDITION_COST = 4;
-    constexpr size_t FF_MULTIPLICATION_COST = 21;
-    constexpr size_t FF_INVERSION_COST = 7000;
-    constexpr size_t GE_ADDITION_COST = 350;
-    constexpr size_t GE_DOUBLING_COST = 194;
-    constexpr size_t SM_COST = 50000;
-    constexpr size_t SEQ_COPY_COST = 3;
     // We take the maximum observed parallel_for cost (388 us) and round it up.
     // The goal of these checks is to avoid significantly (10x) increasing processing time for small workloads. So we
     // can accept not triggering parallel_for if the workload would become faster by half a millisecond for medium
@@ -180,23 +145,11 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points,
     const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
 
     // Compute the cost of all operations done by other threads
-    const size_t offset_cost =
-        (num_points - chunk_size) *
-        (finite_field_additions_per_iteration * FF_ADDITION_COST +
-         finite_field_multiplications_per_iteration * FF_MULTIPLICATION_COST +
-         finite_field_inversions_per_iteration * FF_INVERSION_COST +
-         group_element_additions_per_iteration * GE_ADDITION_COST +
-         group_element_doublings_per_iteration * GE_DOUBLING_COST + scalar_multiplications_per_iteration * SM_COST +
-         sequential_copy_ops_per_iteration * SEQ_COPY_COST);
+    const size_t offset_cost = (num_points - chunk_size) * heuristic_cost;
 
     // If starting parallel for is longer than computing, just compute
     if (offset_cost < PARALLEL_FOR_COST) {
-        if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {
-
-            func(0, num_points);
-        } else {
-            func(0, num_points, 0);
-        }
+        func(0, num_points, 0);
         return;
     }
     // Parallelize over chunks
@@ -213,18 +166,9 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points,
         size_t start = chunk_index * chunk_size;
         size_t end = chunk_index * chunk_size + current_chunk_size;
 
-        if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {
-
-            func(start, end);
-        } else {
-            func(start, end, chunk_index);
-        }
+        func(start, end, chunk_index);
     });
 };
-template void run_loop_in_parallel_if_effective_internal(
-    size_t, const std::function<void(size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
-template void run_loop_in_parallel_if_effective_internal(
-    size_t, const std::function<void(size_t, size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
 
 /**
  * @brief calculates number of threads to create based on minimum iterations per thread