feat: Analyze % of time spent on field arithmetic (#4501)

## Benchmark GoblinFull/1

```
------------------------------------------------------------------
Benchmark                       Time             CPU   Iterations
-------------------------------------------------------------------
GoblinBench/GoblinFull/1    69409 ms        12395 ms            1
aggregating 517841027 counts of asm_add_with_coarse_reduction at time 2.2429419969079083 ns.
aggregating 2385088 counts of asm_conditional_negate at time 1.4026850702603462 ns.
aggregating 268246459 counts of asm_mul_with_coarse_reduction at time 16.81106049038619 ns.
aggregating 972751082 counts of asm_self_add_with_coarse_reduction at time 2.276063249739074 ns.
aggregating 1740115104 counts of asm_self_mul_with_coarse_reduction at time 18.54383332441474 ns.
aggregating 177404712 counts of asm_self_reduce_once at time 2.2564670875677177 ns.
aggregating 294979853 counts of asm_self_sqr_with_coarse_reduction at time 19.935956005726094 ns.
aggregating 605409741 counts of asm_self_sub_with_coarse_reduction at time 2.530065801186028 ns.
aggregating 189416246 counts of asm_sqr_with_coarse_reduction at time 15.879287202925925 ns.
Time spent on field ops: 50.977s.
```

$50.977/69.409 = 0.734$

## Benchmark GoblinFull/6 (the "medium-complexity transaction case")

```
-------------------------------------------------------------------
Benchmark                       Time             CPU   Iterations
-------------------------------------------------------------------
GoblinBench/GoblinFull/6   221121 ms        37562 ms            1
aggregating 1673577237 counts of asm_add_with_coarse_reduction at time 2.2429419969079083 ns.
aggregating 12834880 counts of asm_conditional_negate at time 1.4026850702603462 ns.
aggregating 796532808 counts of asm_mul_with_coarse_reduction at time 16.81106049038619 ns.
aggregating 3011401875 counts of asm_self_add_with_coarse_reduction at time 2.276063249739074 ns.
aggregating 5865206849 counts of asm_self_mul_with_coarse_reduction at time 18.54383332441474 ns.
aggregating 565422726 counts of asm_self_reduce_once at time 2.2564670875677177 ns.
aggregating 1428616720 counts of asm_self_sqr_with_coarse_reduction at time 19.935956005726094 ns.
aggregating 2034002527 counts of asm_self_sub_with_coarse_reduction at time 2.530065801186028 ns.
aggregating 659886656 counts of asm_sqr_with_coarse_reduction at time 15.879287202925925 ns.
Time spent on field ops: 178.161s.
```

$178.161/221.121 = 0.806$

---------

Co-authored-by: ludamad <adam.domurad@gmail.com>
Co-authored-by: ludamad <adam@aztecprotocol.com>
AztecProtocol · Feb 13, 2024 · 5ddfa16 · 5ddfa16
1 parent 5285010
commit 5ddfa16
Show file tree

Hide file tree

Showing 8 changed files with 367 additions and 30 deletions.
diff --git a/barretenberg/cpp/scripts/benchmark_field_ops_percentage.sh b/barretenberg/cpp/scripts/benchmark_field_ops_percentage.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Estimate the share of a benchmark's run time that is spent on field arithmetic.
+#
+# Usage: benchmark_field_ops_percentage.sh [TARGET] [FILTER]
+#   TARGET  benchmark target built and run with op counting (default: goblin_bench)
+#   FILTER  regex passed to --benchmark_filter              (default: GoblinFull/1$)
+#
+# Steps: run TARGET from the op-count-track build to collect per-operation counts,
+# benchmark the basic Fr operations once (the result file is reused on later runs),
+# run the single-threaded comparison benchmark, then combine the two JSON reports
+# with compute_field_operations_time.py.
+#
+# NOTE: compute_field_operations_time.py reads goblin_bench.json and looks up
+# GoblinBench/GoblinFull/1, and the comparison run below is hard-wired to goblin_bench,
+# so only the default TARGET/FILTER are analysed end to end.
+set -eu
+
+TARGET=${1:-goblin_bench}
+# Single quotes keep the trailing `$` (regex end anchor) literal.
+FILTER=${2:-'GoblinFull/1$'}
+
+BUILD_OP_COUNT_TRACK_DIR=build-op-count-track
+
+# Move above script dir.
+cd "$(dirname "$0")/.."
+
+# Measure the benchmarks with ops counting
+cmake --preset op-count-track
+cmake --build --preset op-count-track --target "$TARGET"
+# This can be run multithreaded
+cd "$BUILD_OP_COUNT_TRACK_DIR"
+"./bin/$TARGET" --benchmark_filter="$FILTER" \
+                --benchmark_out="$TARGET.json" \
+                --benchmark_out_format=json \
+                --benchmark_counters_tabular=true
+
+# If needed, benchmark the basic Fr operations
+FIELD_OP_COSTS=field_op_costs.json
+if [ ! -f "$FIELD_OP_COSTS" ]; then
+    cd ../
+    FIELD_OPS_TARGET=fr_straight_bench
+    cmake --preset clang16
+    cmake --build --preset clang16 --target "$FIELD_OPS_TARGET"
+    cd build
+    "./bin/$FIELD_OPS_TARGET" --benchmark_out="../$BUILD_OP_COUNT_TRACK_DIR/$FIELD_OP_COSTS" \
+                              --benchmark_out_format=json
+fi
+
+# Both branches above leave us one level below the cpp directory; step back up.
+# Compute the singly-threaded benchmarks for comparison
+cd ../
+./scripts/benchmark_remote.sh goblin_bench "taskset -c 0 ./goblin_bench --benchmark_filter=Full/1$"
+
+# Analyze the results
+python3 ./scripts/compute_field_operations_time.py
diff --git a/barretenberg/cpp/scripts/compute_field_operations_time.py b/barretenberg/cpp/scripts/compute_field_operations_time.py
@@ -0,0 +1,63 @@
+"""Estimate how much of a benchmark's run time is spent on field arithmetic.
+
+Two benchmark JSON reports under PREFIX are combined:
+  * OPS_BENCH: the measured real time (ns) of each basic field operation;
+  * GOBLIN_BENCH_JSON: the run of BENCHMARK, whose entry stores how many times
+    each operation was executed under keys of the form "fr::<operation>".
+
+The estimate is the sum of count * ns_per_op over the operations in `to_keep`
+that appear in both reports. Operations missing from either report are skipped.
+"""
+import json
+from pathlib import Path
+
+PREFIX = Path("build-op-count-track")
+OPS_BENCH = Path("field_op_costs.json")
+GOBLIN_BENCH_JSON = Path("goblin_bench.json")
+BENCHMARK = "GoblinBench/GoblinFull/1"
+
+# We will populate time per operation for a subset of the operations
+# For accurate counting, we must select operations that do not call other
+# operations on the list.
+ns_per_op = {}
+to_keep = [
+    "asm_add_with_coarse_reduction",
+    "asm_conditional_negate",
+    "asm_mul_with_coarse_reduction",
+    # "asm_reduce_once",
+    "asm_self_add_with_coarse_reduction",
+    "asm_self_mul_with_coarse_reduction",
+    "asm_self_reduce_once",
+    "asm_self_sqr_with_coarse_reduction",
+    "asm_self_sub_with_coarse_reduction",
+    "asm_sqr_with_coarse_reduction",
+    # "mul",
+    # "self_mul",
+    # "add",
+    # "self_add",
+    # "sub",
+    # "self_sub",
+    # "invert", // mostly just self_sqr and *=
+    # "self_neg",
+    # "self_reduce_once",
+    # "self_to_montgomery_form",
+    # "self_sqr",
+    # "sqr",
+]
+
+# Read the measurements of the basic field operations.
+with open(PREFIX/OPS_BENCH, "r") as read_file:
+    read_result = json.load(read_file)
+    for bench in read_result["benchmarks"]:
+        if bench["name"] in to_keep:
+            ns_per_op[bench["name"]] = bench["real_time"]
+
+# Locate the entry of the benchmark under analysis; it carries the op counts.
+# As before, the last entry with a matching name wins.
+mct = None
+with open(PREFIX/GOBLIN_BENCH_JSON, "r") as read_file:
+    read_result = json.load(read_file)
+    for bench in read_result["benchmarks"]:
+        if bench["name"] == BENCHMARK:
+            mct = bench
+
+# Fail with a readable message instead of a NameError further down.
+if mct is None:
+    raise SystemExit(f'Benchmark {BENCHMARK} not found in {PREFIX/GOBLIN_BENCH_JSON}.')
+
+total_time = 0
+
+for (key, time) in ns_per_op.items():
+    full_key = "fr::" + key
+    if full_key in mct:
+        count = int(mct[full_key])
+        print(f'aggregating { count } counts of {key} at time {time} ns.')
+        total_time += count * time
+
+# Nanoseconds to seconds.
+total_time /= 1e9
+
+print(f'Time spent on field ops: {round(total_time, 3)}s.')
diff --git a/barretenberg/cpp/src/barretenberg/common/op_count.hpp b/barretenberg/cpp/src/barretenberg/common/op_count.hpp
@@ -100,6 +100,8 @@ template <OperationLabel Op> struct GlobalOpCount {
         }
         ensure_stats();
         stats->cycles += cycles;
+#else
+        static_cast<void>(cycles);
 #endif
     }
     static constexpr void add_clock_time(std::size_t time)
@@ -111,6 +113,8 @@ template <OperationLabel Op> struct GlobalOpCount {
         }
         ensure_stats();
         stats->time += time;
+#else
+        static_cast<void>(time);
 #endif
     }
 };
@@ -143,4 +147,4 @@ struct OpCountTimeReporter {
 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define BB_OP_COUNT_TIME()                                                                                             \
     bb::detail::OpCountTimeReporter __bb_op_count_time(bb::detail::GlobalOpCount<__func__>::ensure_stats())
-#endif
+#endif
diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr_straight.bench.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr_straight.bench.cpp
@@ -0,0 +1,209 @@
+#include "fr.hpp"
+
+#include <benchmark/benchmark.h>
+
+using namespace bb;
+using namespace benchmark;
+
+// Micro-benchmarks of individual bn254 `fr` operations. Each benchmark repeats one
+// operation on default-constructed operands inside the Google Benchmark timing loop.
+//
+// The function names double as the benchmark names written to the JSON report, and
+// scripts/compute_field_operations_time.py selects entries by exactly those names,
+// so renaming a function here requires updating that script as well.
+//
+// Operations that return their result are wrapped in DoNotOptimize. Operations that
+// update `x` in place are followed by DoNotOptimize(x), so the optimizer has to treat
+// the updated value as observed and cannot drop or hoist the operation being timed.
+//
+// NOTE(review): the operands are default-constructed; whether fr's default constructor
+// gives them a defined value is not visible from this file. Confirm, and seed them
+// explicitly if it does not.
+namespace {
+// Times fr::asm_add_with_coarse_reduction(x, y).
+void asm_add_with_coarse_reduction(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        DoNotOptimize(fr::asm_add_with_coarse_reduction(x, y));
+    }
+}
+BENCHMARK(asm_add_with_coarse_reduction);
+
+// Times fr::asm_conditional_negate(x, true).
+void asm_conditional_negate(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        fr::asm_conditional_negate(x, true);
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(asm_conditional_negate);
+
+// Times fr::asm_mul_with_coarse_reduction(x, y).
+void asm_mul_with_coarse_reduction(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        DoNotOptimize(fr::asm_mul_with_coarse_reduction(x, y));
+    }
+}
+BENCHMARK(asm_mul_with_coarse_reduction);
+
+// Times fr::asm_reduce_once(x).
+void asm_reduce_once(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        DoNotOptimize(fr::asm_reduce_once(x));
+    }
+}
+BENCHMARK(asm_reduce_once);
+
+// Times fr::asm_self_add_with_coarse_reduction(x, y).
+void asm_self_add_with_coarse_reduction(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        fr::asm_self_add_with_coarse_reduction(x, y);
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(asm_self_add_with_coarse_reduction);
+
+// Times fr::asm_self_mul_with_coarse_reduction(x, y).
+void asm_self_mul_with_coarse_reduction(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        fr::asm_self_mul_with_coarse_reduction(x, y);
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(asm_self_mul_with_coarse_reduction);
+
+// Times fr::asm_self_reduce_once(x).
+void asm_self_reduce_once(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        fr::asm_self_reduce_once(x);
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(asm_self_reduce_once);
+
+// Times fr::asm_self_sqr_with_coarse_reduction(x).
+void asm_self_sqr_with_coarse_reduction(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        fr::asm_self_sqr_with_coarse_reduction(x);
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(asm_self_sqr_with_coarse_reduction);
+
+// Times fr::asm_self_sub_with_coarse_reduction(x, y).
+void asm_self_sub_with_coarse_reduction(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        fr::asm_self_sub_with_coarse_reduction(x, y);
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(asm_self_sub_with_coarse_reduction);
+
+// Times fr::asm_sqr_with_coarse_reduction(x).
+void asm_sqr_with_coarse_reduction(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        DoNotOptimize(fr::asm_sqr_with_coarse_reduction(x));
+    }
+}
+BENCHMARK(asm_sqr_with_coarse_reduction);
+
+// Times `x * y`.
+void mul(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        DoNotOptimize(x * y);
+    }
+}
+BENCHMARK(mul);
+
+// Times `x *= y`.
+void self_mul(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        x *= y;
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(self_mul);
+
+// Times `x + y`.
+void add(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        DoNotOptimize(x + y);
+    }
+}
+BENCHMARK(add);
+
+// Times `x += y`.
+void self_add(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        x += y;
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(self_add);
+
+// Times `x - y`.
+void sub(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        DoNotOptimize(x - y);
+    }
+}
+BENCHMARK(sub);
+
+// Times `x -= y`.
+void self_sub(State& state) noexcept
+{
+    fr x, y;
+    for (auto _ : state) {
+        x -= y;
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(self_sub);
+
+// Times x.invert().
+void invert(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        DoNotOptimize(x.invert());
+    }
+}
+BENCHMARK(invert);
+
+// Times x.self_neg().
+void self_neg(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        x.self_neg();
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(self_neg);
+
+// Times x.self_reduce_once().
+void self_reduce_once(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        x.self_reduce_once();
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(self_reduce_once);
+
+// Times x.self_to_montgomery_form().
+void self_to_montgomery_form(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        x.self_to_montgomery_form();
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(self_to_montgomery_form);
+
+// Times x.self_sqr().
+void self_sqr(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        x.self_sqr();
+        DoNotOptimize(x);
+    }
+}
+BENCHMARK(self_sqr);
+
+// Times x.sqr().
+void sqr(State& state) noexcept
+{
+    fr x;
+    for (auto _ : state) {
+        DoNotOptimize(x.sqr());
+    }
+}
+BENCHMARK(sqr);
+} // namespace
+
+// NOLINTNEXTLINE macro invocation triggers style guideline errors from Google Benchmark code
+BENCHMARK_MAIN();
diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp
@@ -475,7 +475,6 @@ template <class Params_> struct alignas(32) field {
     void msgpack_unpack(auto o);
     void msgpack_schema(auto& packer) const { packer.pack_alias(Params::schema_name, "bin32"); }
 
-  private:
     static constexpr uint256_t twice_modulus = modulus + modulus;
     static constexpr uint256_t not_modulus = -modulus;
     static constexpr uint256_t twice_not_modulus = -twice_modulus;