Skip to content

Commit

Permalink
feat: Analyze % of time spent on field arithmetic (#4501)
Browse files Browse the repository at this point in the history
## Benchmark GoblinFull/1
```
------------------------------------------------------------------
Benchmark                         Time             CPU   Iterations
-------------------------------------------------------------------
GoblinBench/GoblinFull/1      69409 ms        12395 ms            1
aggregating 517841027 counts of asm_add_with_coarse_reduction at time 2.2429419969079083 ns.
aggregating 2385088 counts of asm_conditional_negate at time 1.4026850702603462 ns.
aggregating 268246459 counts of asm_mul_with_coarse_reduction at time 16.81106049038619 ns.
aggregating 972751082 counts of asm_self_add_with_coarse_reduction at time 2.276063249739074 ns.
aggregating 1740115104 counts of asm_self_mul_with_coarse_reduction at time 18.54383332441474 ns.
aggregating 177404712 counts of asm_self_reduce_once at time 2.2564670875677177 ns.
aggregating 294979853 counts of asm_self_sqr_with_coarse_reduction at time 19.935956005726094 ns.
aggregating 605409741 counts of asm_self_sub_with_coarse_reduction at time 2.530065801186028 ns.
aggregating 189416246 counts of asm_sqr_with_coarse_reduction at time 15.879287202925925 ns.
Time spent on field ops: 50.977s.
```
$50.977/69.409 = 0.734$

## Benchmark GoblinFull/6 (the "medium-complexity transaction case")
```
-------------------------------------------------------------------
Benchmark                         Time             CPU   Iterations
-------------------------------------------------------------------
GoblinBench/GoblinFull/6     221121 ms        37562 ms            1
aggregating 1673577237 counts of asm_add_with_coarse_reduction at time 2.2429419969079083 ns.
aggregating 12834880 counts of asm_conditional_negate at time 1.4026850702603462 ns.
aggregating 796532808 counts of asm_mul_with_coarse_reduction at time 16.81106049038619 ns.
aggregating 3011401875 counts of asm_self_add_with_coarse_reduction at time 2.276063249739074 ns.
aggregating 5865206849 counts of asm_self_mul_with_coarse_reduction at time 18.54383332441474 ns.
aggregating 565422726 counts of asm_self_reduce_once at time 2.2564670875677177 ns.
aggregating 1428616720 counts of asm_self_sqr_with_coarse_reduction at time 19.935956005726094 ns.
aggregating 2034002527 counts of asm_self_sub_with_coarse_reduction at time 2.530065801186028 ns.
aggregating 659886656 counts of asm_sqr_with_coarse_reduction at time 15.879287202925925 ns.
Time spent on field ops: 178.161s.
```
$178.161/221.121 = 0.806$

---------

Co-authored-by: ludamad <adam.domurad@gmail.com>
Co-authored-by: ludamad <adam@aztecprotocol.com>
  • Loading branch information
3 people authored Feb 13, 2024
1 parent 5285010 commit 5ddfa16
Show file tree
Hide file tree
Showing 8 changed files with 367 additions and 30 deletions.
40 changes: 40 additions & 0 deletions barretenberg/cpp/scripts/benchmark_field_ops_percentage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash
# Estimates how much of a benchmark's wall time is spent on field arithmetic.
#
# Usage: benchmark_field_ops_percentage.sh [TARGET] [FILTER]
#   TARGET  benchmark binary to build and run with op counting (default: goblin_bench)
#   FILTER  regex handed to --benchmark_filter                  (default: GoblinFull/1$)
#
# Steps:
#   1. Build TARGET with the op-count-track preset and run it, writing TARGET.json
#      into build-op-count-track/.
#   2. If build-op-count-track/field_op_costs.json does not exist yet, build and run
#      fr_straight_bench with the clang16 preset to produce it.
#   3. Run the single-threaded comparison benchmark through benchmark_remote.sh.
#   4. Run compute_field_operations_time.py to combine the two JSON files.
#
# NOTE(review): the analysis script reads build-op-count-track/goblin_bench.json and
# looks up "GoblinBench/GoblinFull/1", and step 3 is hard-coded to goblin_bench, so a
# non-default TARGET/FILTER only affects step 1.
set -eu

TARGET=${1:-goblin_bench}
# The filter is a regex, not a path, so it takes no "./" prefix.
FILTER=${2:-"GoblinFull/1$"}

BUILD_OP_COUNT_TRACK_DIR=build-op-count-track

# Move above script dir.
cd "$(dirname "$0")/.."

# Measure the benchmarks with ops counting
cmake --preset op-count-track
cmake --build --preset op-count-track --target "$TARGET"
# This can be run multithreaded
cd "$BUILD_OP_COUNT_TRACK_DIR"
# Each continuation line needs whitespace before the backslash; otherwise the flags
# are joined into a single argument.
"./bin/$TARGET" \
  --benchmark_filter="$FILTER" \
  --benchmark_out="$TARGET.json" \
  --benchmark_out_format=json \
  --benchmark_counters_tabular=true

# If needed, benchmark the basic Fr operations
FIELD_OP_COSTS=field_op_costs.json
if [ ! -f "$FIELD_OP_COSTS" ]; then
  cd ../
  FIELD_OPS_TARGET=fr_straight_bench
  cmake --preset clang16
  cmake --build --preset clang16 --target "$FIELD_OPS_TARGET"
  # Presumably the clang16 preset builds into ./build -- confirm against CMakePresets.json.
  cd build
  "./bin/$FIELD_OPS_TARGET" \
    --benchmark_out="../$BUILD_OP_COUNT_TRACK_DIR/$FIELD_OP_COSTS" \
    --benchmark_out_format=json
fi

# Compute the singly-threaded benchmarks for comparison.
# Whether or not the branch above ran, we are one directory below the cpp root here.
cd ../
./scripts/benchmark_remote.sh goblin_bench "taskset -c 0 ./goblin_bench --benchmark_filter=Full/1$"

# Analyze the results
python3 ./scripts/compute_field_operations_time.py
63 changes: 63 additions & 0 deletions barretenberg/cpp/scripts/compute_field_operations_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Estimate the time a benchmark spends on field arithmetic.

Reads two Google Benchmark JSON files from ``build-op-count-track``:

* ``field_op_costs.json`` -- the per-call ``real_time`` of each basic field operation;
* ``goblin_bench.json``   -- the op-count-tracked benchmark, whose entry carries one
  ``fr::<operation>`` counter per operation.

For every operation in ``to_keep`` the call count is multiplied by the measured
per-call time, and the sum is printed in seconds.
"""
import json
from pathlib import Path

PREFIX = Path("build-op-count-track")
OPS_BENCH = Path("field_op_costs.json")
GOBLIN_BENCH_JSON = Path("goblin_bench.json")
BENCHMARK = "GoblinBench/GoblinFull/1"

# We will populate time per operation for a subset of the operations
# For accurate counting, we must select operations that do not call other
# operations on the list.
to_keep = [
    "asm_add_with_coarse_reduction",
    "asm_conditional_negate",
    "asm_mul_with_coarse_reduction",
    # "asm_reduce_once",
    "asm_self_add_with_coarse_reduction",
    "asm_self_mul_with_coarse_reduction",
    "asm_self_reduce_once",
    "asm_self_sqr_with_coarse_reduction",
    "asm_self_sub_with_coarse_reduction",
    "asm_sqr_with_coarse_reduction",
    # "mul",
    # "self_mul",
    # "add",
    # "self_add",
    # "sub",
    # "self_sub",
    # "invert", // mostly just self_sqr and *=
    # "self_neg",
    # "self_reduce_once",
    # "self_to_montgomery_form",
    # "self_sqr",
    # "sqr",
]


def load_ns_per_op(path, names=to_keep):
    """Read the measurements of the basic field operations.

    Args:
        path: Google Benchmark JSON file with a top-level ``"benchmarks"`` list.
        names: benchmark names to keep; every other entry is ignored.

    Returns:
        Dict mapping benchmark name to its ``real_time``, in file order.
        Assumes the benchmark was emitted in nanoseconds -- TODO confirm.
    """
    with open(path, "r") as read_file:
        benchmarks = json.load(read_file)["benchmarks"]
    return {
        bench["name"]: bench["real_time"]
        for bench in benchmarks
        if bench["name"] in names
    }


def find_benchmark(path, name):
    """Return the entry called ``name`` from a Google Benchmark JSON file.

    If several entries share the name, the last one is returned.

    Raises:
        ValueError: if no entry is called ``name``. Without this check the script
            failed later with an unrelated ``NameError``.
    """
    with open(path, "r") as read_file:
        benchmarks = json.load(read_file)["benchmarks"]
    found = None
    for bench in benchmarks:
        if bench["name"] == name:
            found = bench
    if found is None:
        raise ValueError(f"benchmark {name!r} not found in {path}")
    return found


def field_ops_seconds(ns_per_op, counters):
    """Sum ``count * ns_per_call`` over all tracked operations.

    Args:
        ns_per_op: dict of operation name -> time per call in nanoseconds.
        counters: benchmark entry; the call count of operation ``op`` is read from
            its ``"fr::" + op`` key. Operations without a counter are skipped.

    Returns:
        Total time in seconds. One line per aggregated operation is printed.
    """
    total_ns = 0
    for key, ns in ns_per_op.items():
        full_key = "fr::" + key
        if full_key not in counters:
            continue
        # Counters may be stored as floats in the JSON; truncate to a whole count.
        count = int(counters[full_key])
        print(f'aggregating { count } counts of {key} at time {ns} ns.')
        total_ns += count * ns
    return total_ns / 1e9


def main():
    """Load both benchmark files and print the time spent on field operations."""
    ns_per_op = load_ns_per_op(PREFIX / OPS_BENCH)
    mct = find_benchmark(PREFIX / GOBLIN_BENCH_JSON, BENCHMARK)
    total_time = field_ops_seconds(ns_per_op, mct)
    print(f'Time spent on field ops: {round(total_time, 3)}s.')


if __name__ == "__main__":
    main()
6 changes: 5 additions & 1 deletion barretenberg/cpp/src/barretenberg/common/op_count.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ template <OperationLabel Op> struct GlobalOpCount {
}
ensure_stats();
stats->cycles += cycles;
#else
static_cast<void>(cycles);
#endif
}
static constexpr void add_clock_time(std::size_t time)
Expand All @@ -111,6 +113,8 @@ template <OperationLabel Op> struct GlobalOpCount {
}
ensure_stats();
stats->time += time;
#else
static_cast<void>(time);
#endif
}
};
Expand Down Expand Up @@ -143,4 +147,4 @@ struct OpCountTimeReporter {
// Declares a local bb::detail::OpCountTimeReporter named __bb_op_count_time, constructed from the
// stats object returned by GlobalOpCount<__func__>::ensure_stats(), i.e. the stats are selected by
// the name of the function in which the macro is expanded.
// NOTE(review): presumably the reporter adds the elapsed time to those stats when it goes out of
// scope -- confirm against OpCountTimeReporter, which is not visible in this excerpt.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() \
bb::detail::OpCountTimeReporter __bb_op_count_time(bb::detail::GlobalOpCount<__func__>::ensure_stats())
#endif
#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#include "fr.hpp"

#include <benchmark/benchmark.h>

using namespace bb;
using namespace benchmark;

// Micro-benchmarks for individual `fr` field operations, one Google Benchmark per operation.
//
// Every benchmark runs a single operation per iteration of the `state` loop:
//  * operations that return their result pass it straight to DoNotOptimize();
//  * operations that update `x` in place are followed by DoNotOptimize(x), so the
//    compiler must treat the updated value as observed and cannot elide the
//    operation being timed.
//
// The registered benchmark names are presumably the ones compute_field_operations_time.py
// matches against its `to_keep` list, so they should not be renamed without updating it.
//
// NOTE(review): the operands are default-constructed; confirm that fr's default
// constructor initializes the limbs, otherwise the benchmarks run on indeterminate inputs.
namespace {
// Out-of-place add: asm_add_with_coarse_reduction(x, y).
void asm_add_with_coarse_reduction(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        DoNotOptimize(fr::asm_add_with_coarse_reduction(x, y));
    }
}
BENCHMARK(asm_add_with_coarse_reduction);

// asm_conditional_negate(x, true); the call's result (if any) is discarded, so `x` is
// exposed to the optimizer barrier afterwards.
void asm_conditional_negate(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        fr::asm_conditional_negate(x, true);
        DoNotOptimize(x);
    }
}
BENCHMARK(asm_conditional_negate);

// Out-of-place multiply: asm_mul_with_coarse_reduction(x, y).
void asm_mul_with_coarse_reduction(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        DoNotOptimize(fr::asm_mul_with_coarse_reduction(x, y));
    }
}
BENCHMARK(asm_mul_with_coarse_reduction);

// Out-of-place reduction: asm_reduce_once(x).
void asm_reduce_once(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        DoNotOptimize(fr::asm_reduce_once(x));
    }
}
BENCHMARK(asm_reduce_once);

// In-place add: asm_self_add_with_coarse_reduction(x, y).
void asm_self_add_with_coarse_reduction(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        fr::asm_self_add_with_coarse_reduction(x, y);
        DoNotOptimize(x);
    }
}
BENCHMARK(asm_self_add_with_coarse_reduction);

// In-place multiply: asm_self_mul_with_coarse_reduction(x, y).
void asm_self_mul_with_coarse_reduction(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        fr::asm_self_mul_with_coarse_reduction(x, y);
        DoNotOptimize(x);
    }
}
BENCHMARK(asm_self_mul_with_coarse_reduction);

// In-place reduction: asm_self_reduce_once(x).
void asm_self_reduce_once(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        fr::asm_self_reduce_once(x);
        DoNotOptimize(x);
    }
}
BENCHMARK(asm_self_reduce_once);

// In-place square: asm_self_sqr_with_coarse_reduction(x).
void asm_self_sqr_with_coarse_reduction(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        fr::asm_self_sqr_with_coarse_reduction(x);
        DoNotOptimize(x);
    }
}
BENCHMARK(asm_self_sqr_with_coarse_reduction);

// In-place subtract: asm_self_sub_with_coarse_reduction(x, y).
void asm_self_sub_with_coarse_reduction(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        fr::asm_self_sub_with_coarse_reduction(x, y);
        DoNotOptimize(x);
    }
}
BENCHMARK(asm_self_sub_with_coarse_reduction);

// Out-of-place square: asm_sqr_with_coarse_reduction(x).
void asm_sqr_with_coarse_reduction(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        DoNotOptimize(fr::asm_sqr_with_coarse_reduction(x));
    }
}
BENCHMARK(asm_sqr_with_coarse_reduction);

// operator*: x * y.
void mul(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        DoNotOptimize(x * y);
    }
}
BENCHMARK(mul);

// operator*=: x *= y.
void self_mul(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        x *= y;
        DoNotOptimize(x);
    }
}
BENCHMARK(self_mul);

// operator+: x + y.
void add(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        DoNotOptimize(x + y);
    }
}
BENCHMARK(add);

// operator+=: x += y.
void self_add(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        x += y;
        DoNotOptimize(x);
    }
}
BENCHMARK(self_add);

// operator-: x - y.
void sub(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        DoNotOptimize(x - y);
    }
}
BENCHMARK(sub);

// operator-=: x -= y.
void self_sub(State& state) noexcept
{
    fr x, y;
    for (auto _ : state) {
        x -= y;
        DoNotOptimize(x);
    }
}
BENCHMARK(self_sub);

// x.invert(), result returned by value.
void invert(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        DoNotOptimize(x.invert());
    }
}
BENCHMARK(invert);

// x.self_neg(), in place.
void self_neg(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        x.self_neg();
        DoNotOptimize(x);
    }
}
BENCHMARK(self_neg);

// x.self_reduce_once(), in place.
void self_reduce_once(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        x.self_reduce_once();
        DoNotOptimize(x);
    }
}
BENCHMARK(self_reduce_once);

// x.self_to_montgomery_form(), in place.
void self_to_montgomery_form(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        x.self_to_montgomery_form();
        DoNotOptimize(x);
    }
}
BENCHMARK(self_to_montgomery_form);

// x.self_sqr(), in place.
void self_sqr(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        x.self_sqr();
        DoNotOptimize(x);
    }
}
BENCHMARK(self_sqr);

// x.sqr(), result returned by value.
void sqr(State& state) noexcept
{
    fr x;
    for (auto _ : state) {
        DoNotOptimize(x.sqr());
    }
}
BENCHMARK(sqr);
} // namespace

// NOLINTNEXTLINE macro invocation triggers style guideline errors from Google Benchmark code
BENCHMARK_MAIN();
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,6 @@ template <class Params_> struct alignas(32) field {
void msgpack_unpack(auto o);
void msgpack_schema(auto& packer) const { packer.pack_alias(Params::schema_name, "bin32"); }

private:
static constexpr uint256_t twice_modulus = modulus + modulus;
static constexpr uint256_t not_modulus = -modulus;
static constexpr uint256_t twice_not_modulus = -twice_modulus;
Expand Down
Loading

0 comments on commit 5ddfa16

Please sign in to comment.