From 34a2e20f128a29a28d24d9f65508da2b3b3eb02e Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Fri, 9 Jun 2023 13:30:27 +0200 Subject: [PATCH 1/6] add benchmarks for BN256 --- Cargo.toml | 4 ++++ benches/bn256_field.rs | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 benches/bn256_field.rs diff --git a/Cargo.toml b/Cargo.toml index e203e6c1..226483e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,3 +56,7 @@ codegen-units = 1 [[bench]] name = "less_than" harness = false + +[[bench]] +name = "bn256_field" +harness = false diff --git a/benches/bn256_field.rs b/benches/bn256_field.rs new file mode 100644 index 00000000..627c8501 --- /dev/null +++ b/benches/bn256_field.rs @@ -0,0 +1,34 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use halo2curves::bn256::*; +use halo2curves::ff::Field; +use rand::SeedableRng; +use rand_xorshift::XorShiftRng; + +pub fn bench_bn256_field(c: &mut Criterion) { + let mut rng = XorShiftRng::from_seed([ + 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, + 0xe5, + ]); + + let a = Fq::random(&mut rng); + let b = Fq::random(&mut rng); + + #[cfg(not(feature = "asm"))] + let mut group = c.benchmark_group("BN256 Field Arithmetic (no assembly)"); + + #[cfg(feature = "asm")] + let mut group = c.benchmark_group("BN256 Field Arithmetic (with assembly)"); + + group.significance_level(0.1).sample_size(10000); + group.throughput(Throughput::Elements(1)); + + group.bench_function("bn256_fq_add", |bencher| bencher.iter(|| black_box(&a).add(black_box(&b)))); + group.bench_function("bn256_fq_double", |bencher| bencher.iter(|| black_box(&a).double())); + group.bench_function("bn256_fq_sub", |bencher| bencher.iter(|| black_box(&a).sub(black_box(&b)))); + group.bench_function("bn256_fq_neg", |bencher| bencher.iter(|| black_box(&a).neg())); + group.bench_function("bn256_fq_mul", |bencher| bencher.iter(|| 
black_box(&a).mul(black_box(&b)))); + group.bench_function("bn256_fq_square", |bencher| bencher.iter(|| black_box(&a).square())); +} + +criterion_group!(benches, bench_bn256_field); +criterion_main!(benches); From ef6e7012d69759d8bd402dc53b59cd879fe703ec Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Fri, 9 Jun 2023 18:40:41 +0200 Subject: [PATCH 2/6] small assembly changes: adcx -> adc --- src/bn256/assembly.rs | 327 ++++-------------------------------------- 1 file changed, 24 insertions(+), 303 deletions(-) diff --git a/src/bn256/assembly.rs b/src/bn256/assembly.rs index 1a47655d..426abaa7 100644 --- a/src/bn256/assembly.rs +++ b/src/bn256/assembly.rs @@ -24,9 +24,9 @@ macro_rules! field_arithmetic_asm { // // add a array and b array with carry "add r8, r8", - "adcx r9, r9", - "adcx r10, r10", - "adcx r11, r11", + "adc r9, r9", + "adc r10, r10", + "adc r11, r11", // copy result array to latter registers "mov r12, r8", @@ -65,290 +65,7 @@ macro_rules! field_arithmetic_asm { /// Squares this element. 
#[inline] pub fn square(&self) -> $field { - let mut r0: u64; - let mut r1: u64; - let mut r2: u64; - let mut r3: u64; - unsafe { - asm!( - // schoolbook multiplication - // * | a0 | a1 | a2 | a3 - // b0 | b0 * a0 | b0 * a1 | b0 * a2 | b0 * a3 - // b1 | b1 * a0 | b1 * a1 | b1 * a2 | b1 * a3 - // b2 | b2 * a0 | b2 * a1 | b2 * a2 | b2 * a3 - // b3 | b3 * a0 | b3 * a1 | b3 * a2 | b3 * a3 - - // load value to registers - "mov r13, qword ptr [{a_ptr} + 0]", - "mov r14, qword ptr [{a_ptr} + 8]", - "mov r15, qword ptr [{a_ptr} + 16]", - - // `a0` - "mov rdx, r13", - - // a0 * b0 - "mulx r9, r8, r13", - - // a0 * b1 - "mulx r10, rax, r14", - "add r9, rax", - - // a0 * b2 - "mulx r11, rax, r15", - "adcx r10, rax", - - // a0 * b3 - "mulx r12, rax, qword ptr [{a_ptr} + 24]", - "adcx r11, rax", - "adc r12, 0", - - // `a1` - "mov rdx, r14", - - // a1 * b0 - "mulx rcx, rax, r13", - "add r9, rax", - "adcx r10, rcx", - "adc r11, 0", - - // a1 * b1 - "mulx rcx, rax, r14", - "add r10, rax", - "adcx r11, rcx", - "adc r12, 0", - "xor r13, r13", - - // a1 * b2 - "mulx rcx, rax, r15", - "add r11, rax", - "adcx r12, rcx", - "adc r13, 0", - "xor r14, r14", - - // a1 * b3 - "mulx rcx, rax, qword ptr [{a_ptr} + 24]", - "add r12, rax", - "adcx r13, rcx", - "adc r14, 0", - - // `a2` - "mov rdx, r15", - - // a2 * b0 - "mulx rcx, rax, qword ptr [{a_ptr} + 0]", - "add r10, rax", - "adcx r11, rcx", - "adc r12, 0", - - // a2 * b1 - "mulx rcx, rax, qword ptr [{a_ptr} + 8]", - "add r11, rax", - "adcx r12, rcx", - "adc r13, 0", - - // a2 * b2 - "mulx rcx, rax, r15", - "add r12, rax", - "adcx r13, rcx", - "adc r14, 0", - "xor r15, r15", - - // a2 * b3 - "mulx rcx, rax, qword ptr [{a_ptr} + 24]", - "add r13, rax", - "adcx r14, rcx", - "adc r15, 0", - - // `a3` - "mov rdx, qword ptr [{a_ptr} + 24]", - - // a3 * b0 - "mulx rcx, rax, qword ptr [{a_ptr} + 0]", - "add r11, rax", - "adcx r12, rcx", - "adc r13, 0", - - // a3 * b1 - "mulx rcx, rax, qword ptr [{a_ptr} + 8]", - "add r12, rax", - "adcx r13, rcx", 
- "adc r14, 0", - - // a3 * b2 - "mulx rcx, rax, qword ptr [{a_ptr} + 16]", - "add r13, rax", - "adcx r14, rcx", - "adc r15, 0", - - // a3 * b3 - "mulx rcx, rax, qword ptr [{a_ptr} + 24]", - "add r14, rax", - "adc r15, rcx", - - // montgomery reduction - // r8 ~ r15 - - // `r8` -> 0 - "mov rdx, {inv}", - "mulx rax, rdx, r8", - - // r8' * m0 - "mulx rcx, rax, qword ptr [{m_ptr} + 0]", - "add r8, rax", - "adcx r9, rcx", - "adc r10, 0", - - // r8' * m1 - "mulx rcx, rax, qword ptr [{m_ptr} + 8]", - "add r9, rax", - "adcx r10, rcx", - "adc r11, 0", - - // r8' * m2 - "mulx rcx, rax, qword ptr [{m_ptr} + 16]", - "add r10, rax", - "adcx r11, rcx", - "adc r12, 0", - - // r8' * m3 - "mulx rcx, rax, qword ptr [{m_ptr} + 24]", - "add r11, rax", - "adcx r12, rcx", - "adc r13, 0", - - // `r9` -> 0 - "mov rdx, {inv}", - "mulx rax, rdx, r9", - - // r9' * m0 - "mulx rax, rcx, qword ptr [{m_ptr} + 0]", - "add r9, rcx", - "adcx r10, rax", - "adc r11, 0", - - // r9' * m1 - "mulx rax, rcx, qword ptr [{m_ptr} + 8]", - "add r10, rcx", - "adcx r11, rax", - "adc r12, 0", - - // r9' * m2 - "mulx rax, rcx, qword ptr [{m_ptr} + 16]", - "add r11, rcx", - "adcx r12, rax", - "adc r13, 0", - - // r9' * m3 - "mulx rax, rcx, qword ptr [{m_ptr} + 24]", - "add r12, rcx", - "adcx r13, rax", - "adc r14, 0", - - // `r10` -> 0 - "mov rdx, {inv}", - "mulx rax, rdx, r10", - - // r10' * m0 - "mulx rax, rcx, qword ptr [{m_ptr} + 0]", - "add r10, rcx", - "adcx r11, rax", - "adc r12, 0", - - // r10' * m1 - "mulx rax, rcx, qword ptr [{m_ptr} + 8]", - "add r11, rcx", - "adcx r12, rax", - "adc r13, 0", - - // r10' * m2 - "mulx rax, rcx, qword ptr [{m_ptr} + 16]", - "add r12, rcx", - "adcx r13, rax", - "adc r14, 0", - - // r10' * m3 - "mulx rax, rcx, qword ptr [{m_ptr} + 24]", - "add r13, rcx", - "adcx r14, rax", - "adc r15, 0", - - // `r11` -> 0 - "mov rdx, {inv}", - "mulx rax, rdx, r11", - - // r11' * m0 - "mulx rax, rcx, qword ptr [{m_ptr} + 0]", - "add r11, rcx", - "adcx r12, rax", - "adc r13, 0", - - // r11' 
* m1 - "mulx rax, rcx, qword ptr [{m_ptr} + 8]", - "add r12, rcx", - "adcx r13, rax", - "adc r14, 0", - - // r11' * m2 - "mulx rax, rcx, qword ptr [{m_ptr} + 16]", - "add r13, rcx", - "adcx r14, rax", - "adc r15, 0", - - // r11' * m3 - "mulx rax, rcx, qword ptr [{m_ptr} + 24]", - "add r14, rcx", - "adcx r15, rax", - - // reduction if limbs is greater then mod - "mov r8, r12", - "mov r9, r13", - "mov r10, r14", - "mov r11, r15", - - "sub r8, qword ptr [{m_ptr} + 0]", - "sbb r9, qword ptr [{m_ptr} + 8]", - "sbb r10, qword ptr [{m_ptr} + 16]", - "sbb r11, qword ptr [{m_ptr} + 24]", - - "cmovc r8, r12", - "cmovc r9, r13", - "cmovc r10, r14", - "cmovc r11, r15", - - "mov r12, r8", - "mov r13, r9", - "mov r14, r10", - "mov r15, r11", - - "sub r12, qword ptr [{m_ptr} + 0]", - "sbb r13, qword ptr [{m_ptr} + 8]", - "sbb r14, qword ptr [{m_ptr} + 16]", - "sbb r15, qword ptr [{m_ptr} + 24]", - - "cmovc r12, r8", - "cmovc r13, r9", - "cmovc r14, r10", - "cmovc r15, r11", - - a_ptr = in(reg) self.0.as_ptr(), - m_ptr = in(reg) $modulus.0.as_ptr(), - inv = const $inv, - out("rax") _, - out("rcx") _, - out("rdx") _, - out("r8") _, - out("r9") _, - out("r10") _, - out("r11") _, - out("r12") r0, - out("r13") r1, - out("r14") r2, - out("r15") r3, - options(pure, readonly, nostack) - ) - } - - $field([r0, r1, r2, r3]) + self.mul(self) } #[inline(always)] @@ -836,10 +553,10 @@ macro_rules! field_arithmetic_asm { unsafe { asm!( // init modulus area - "xor r12, r12", - "xor r13, r13", - "xor r14, r14", - "xor r15, r15", + "mov r12, qword ptr [{m_ptr} + 0]", + "mov r13, qword ptr [{m_ptr} + 8]", + "mov r14, qword ptr [{m_ptr} + 16]", + "mov r15, qword ptr [{m_ptr} + 24]", // load a array to former registers "mov r8, qword ptr [{a_ptr} + 0]", @@ -853,21 +570,25 @@ macro_rules! 
field_arithmetic_asm { "sbb r10, qword ptr [{b_ptr} + 16]", "sbb r11, qword ptr [{b_ptr} + 24]", - // if carry copy modulus - "cmovc r12, qword ptr [{m_ptr} + 0]", - "cmovc r13, qword ptr [{m_ptr} + 8]", - "cmovc r14, qword ptr [{m_ptr} + 16]", - "cmovc r15, qword ptr [{m_ptr} + 24]", + // Mask: rax is all ones (0xFFFF_FFFF_FFFF_FFFF) if a - b borrowed (a < b), zero otherwise + "sbb rax, rax", - // mod addition + // Keep the modulus if a - b borrowed, zero it out otherwise + "and r12, rax", + "and r13, rax", + "and r14, rax", + "and r15, rax", + + // Add the masked modulus: yields a-b+m if a - b borrowed, a-b otherwise "add r12, r8", - "adcx r13, r9", - "adcx r14, r10", - "adcx r15, r11", + "adc r13, r9", + "adc r14, r10", + "adc r15, r11", m_ptr = in(reg) $modulus.0.as_ptr(), a_ptr = in(reg) self.0.as_ptr(), b_ptr = in(reg) rhs.0.as_ptr(), + out("rax") _, out("r8") _, out("r9") _, out("r10") _, @@ -899,9 +620,9 @@ macro_rules! field_arithmetic_asm { // add a array and b array with carry "add r8, qword ptr [{b_ptr} + 0]", - "adcx r9, qword ptr [{b_ptr} + 8]", - "adcx r10, qword ptr [{b_ptr} + 16]", - "adcx r11, qword ptr [{b_ptr} + 24]", + "adc r9, qword ptr [{b_ptr} + 8]", + "adc r10, qword ptr [{b_ptr} + 16]", + "adc r11, qword ptr [{b_ptr} + 24]", // copy result array to latter registers "mov r12, r8", From 90acaf970580cb3519a61a3da46cf877d7cecb48 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Fri, 9 Jun 2023 19:01:00 +0200 Subject: [PATCH 3/6] rework mul assembly to use 2 carry chains --- src/bn256/assembly.rs | 483 +++++++++++++++++++----------------------- 1 file changed, 220 insertions(+), 263 deletions(-) diff --git a/src/bn256/assembly.rs b/src/bn256/assembly.rs index 426abaa7..9ee13ee1 100644 --- a/src/bn256/assembly.rs +++ b/src/bn256/assembly.rs @@ -262,280 +262,237 @@ macro_rules!
field_arithmetic_asm { let mut r3: u64; unsafe { asm!( - // schoolbook multiplication - // * | a0 | a1 | a2 | a3 - // b0 | b0 * a0 | b0 * a1 | b0 * a2 | b0 * a3 - // b1 | b1 * a0 | b1 * a1 | b1 * a2 | b1 * a3 - // b2 | b2 * a0 | b2 * a1 | b2 * a2 | b2 * a3 - // b3 | b3 * a0 | b3 * a1 | b3 * a2 | b3 * a3 - - // load value to registers - "mov r13, qword ptr [{b_ptr} + 0]", - "mov r14, qword ptr [{b_ptr} + 8]", - "mov r15, qword ptr [{b_ptr} + 16]", - - // `a0` - "mov rdx, qword ptr [{a_ptr} + 0]", - - // a0 * b0 - "mulx r9, r8, r13", - - // a0 * b1 - "mulx r10, rax, r14", - "add r9, rax", - - // a0 * b2 - "mulx r11, rax, r15", - "adcx r10, rax", - - // a0 * b3 - "mulx r12, rax, qword ptr [{b_ptr} + 24]", - "adcx r11, rax", - "adc r12, 0", - - // `a1` - "mov rdx, [{a_ptr} + 8]", - - // a1 * b0 - "mulx rcx, rax, r13", - "add r9, rax", - "adcx r10, rcx", - "adc r11, 0", - - // a1 * b1 - "mulx rcx, rax, r14", - "add r10, rax", - "adcx r11, rcx", - "adc r12, 0", - "xor r13, r13", - - // a1 * b2 - "mulx rcx, rax, r15", - "add r11, rax", - "adcx r12, rcx", - "adc r13, 0", - "xor r14, r14", - - // a1 * b3 - "mulx rcx, rax, qword ptr [{b_ptr} + 24]", - "add r12, rax", - "adcx r13, rcx", - "adc r14, 0", - - // `a2` - "mov rdx, [{a_ptr} + 16]", - - // a2 * b0 - "mulx rcx, rax, qword ptr [{b_ptr} + 0]", - "add r10, rax", - "adcx r11, rcx", - "adc r12, 0", - - // a2 * b1 - "mulx rcx, rax, qword ptr [{b_ptr} + 8]", - "add r11, rax", - "adcx r12, rcx", - "adc r13, 0", - - // a2 * b2 - "mulx rcx, rax, r15", - "add r12, rax", - "adcx r13, rcx", - "adc r14, 0", - "xor r15, r15", - - // a2 * b3 - "mulx rcx, rax, qword ptr [{b_ptr} + 24]", - "add r13, rax", - "adcx r14, rcx", - "adc r15, 0", - - // `a3` - "mov rdx, [{a_ptr} + 24]", - - // a3 * b0 - "mulx rcx, rax, qword ptr [{b_ptr} + 0]", - "add r11, rax", - "adcx r12, rcx", - "adc r13, 0", - - // a3 * b1 - "mulx rcx, rax, qword ptr [{b_ptr} + 8]", - "add r12, rax", - "adcx r13, rcx", - "adc r14, 0", - - // a3 * b2 - "mulx rcx, rax, 
qword ptr [{b_ptr} + 16]", - "add r13, rax", - "adcx r14, rcx", - "adc r15, 0", - - // a3 * b3 - "mulx rcx, rax, qword ptr [{b_ptr} + 24]", - "add r14, rax", - "adc r15, rcx", - - // montgomery reduction - // r8 ~ r15 - - // `r8` -> 0 - "mov rdx, {inv}", - "mulx rax, rdx, r8", - // r8' * m0 - "mulx rcx, rax, qword ptr [{m_ptr} + 0]", - "add r8, rax", - "adcx r9, rcx", - "adc r10, 0", - - // r8' * m1 - "mulx rcx, rax, qword ptr [{m_ptr} + 8]", - "add r9, rax", - "adcx r10, rcx", - "adc r11, 0", - - // // r8' * m2 - "mulx rcx, rax, qword ptr [{m_ptr} + 16]", - "add r10, rax", - "adcx r11, rcx", - "adc r12, 0", - - // // r8' * m3 - "mulx rcx, rax, qword ptr [{m_ptr} + 24]", - "add r11, rax", - "adcx r12, rcx", - "adc r13, 0", - - // `r9` -> 0 - "mov rdx, {inv}", - "mulx rax, rdx, r9", - - // r9' * m0 - "mulx rax, rcx, qword ptr [{m_ptr} + 0]", - "add r9, rcx", - "adcx r10, rax", - "adc r11, 0", - - // r9' * m1 - "mulx rax, rcx, qword ptr [{m_ptr} + 8]", - "add r10, rcx", - "adcx r11, rax", - "adc r12, 0", - - // r9' * m2 - "mulx rax, rcx, qword ptr [{m_ptr} + 16]", - "add r11, rcx", - "adcx r12, rax", - "adc r13, 0", - - // r9' * m3 - "mulx rax, rcx, qword ptr [{m_ptr} + 24]", - "add r12, rcx", + // Coarsely Integrated Operand Scanning: + // - Analyzing and Comparing Montgomery Multiplication Algorithms + // Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. 
+ // http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf + // + // No-carry optimization + // - https://hackmd.io/@gnark/modular_multiplication + // + // Code generator + // - https://github.com/mratsim/constantine/blob/151f284/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim#L231-L269 + // + // Assembly generated + // - https://github.com/ethereum/evmone/blob/d006d81/lib/evmmax/mulMont256_spare_bits_asm_adx.S + + // Algorithm + // ----------------------------------------- + // for i=0 to N-1 + // for j=0 to N-1 + // (A,t[j]) := t[j] + a[j]*b[i] + A + // m := t[0]*m0ninv mod W + // C,_ := t[0] + m*M[0] + // for j=1 to N-1 + // (C,t[j-1]) := t[j] + m*M[j] + C + // t[N-1] = C + A + + // Outer loop i = 0 + // Multiplication + "mov rdx, qword ptr [{b_ptr} + 0]", + "mulx r13, r11, qword ptr [{a_ptr} + 0]", + "mulx rax, r15, qword ptr [{a_ptr} + 8]", + "add r13, r15", + "mulx r14, r15, qword ptr [{a_ptr} + 16]", + "adc rax, r15", + // Multiplication,
last limb + "mulx r12, r15, qword ptr [{a_ptr} + 24]", + "adc r14, r15", + "adc r12, 0", // accumulate last carries in hi word + + // Reduction + // m = t[0] * m0ninv mod 2^w + "mov rdx, r11", + "imul rdx, {inv}", + "xor r15, r15", + // C,_ := t[0] + m*M[0] + "mulx r15, r10, qword ptr [{m_ptr} + 0]", + "adcx r10, r11", + "mov r11, r15", + "mov r10, 0", + // for j=1 to N-1 + // (C, t[j-1]) := t[j] + m*M[j] + C + "adcx r11, r13", + "mulx r13, r15, qword ptr [{m_ptr} + 8]", + "adox r11, r15", "adcx r13, rax", - "adc r14, 0", - - // `r10` -> 0 - "mov rdx, {inv}", - "mulx rax, rdx, r10", - - // r10' * m0 - "mulx rax, rcx, qword ptr [{m_ptr} + 0]", - "add r10, rcx", - "adcx r11, rax", - "adc r12, 0", - - // r10' * m1 - "mulx rax, rcx, qword ptr [{m_ptr} + 8]", - "add r11, rcx", - "adcx r12, rax", - "adc r13, 0", - - // r10' * m2 - "mulx rax, rcx, qword ptr [{m_ptr} + 16]", - "add r12, rcx", + "mulx rax, r15, qword ptr [{m_ptr} + 16]", + "adox r13, r15", + "adcx rax, r14", + "mulx r14, r15, qword ptr [{m_ptr} + 24]", + "adox rax, r15", + // Reduction carry + "adcx r10, r12", + "adox r14, r10", + + // Outer loop i = 1, j in [0, 4) + "mov rdx, qword ptr [{b_ptr} + 8]", + "xor r12, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 0]", + "adox r11, r15", + "adcx r13, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 8]", + "adox r13, r15", + "adcx rax, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 16]", + "adox rax, r15", + "adcx r14, r12", + // Multiplication, last limb + "mulx r12, r15, qword ptr [{a_ptr} + 24]", + "adox r14, r15", + "mov rdx, 0", // accumulate last carries in hi word + "adcx r12, rdx", + "adox r12, rdx", + + // Reduction + // m = t[0] * m0ninv mod 2^w + "mov rdx, r11", + "imul rdx, {inv}", + "xor r15, r15", + // C,_ := t[0] + m*M[0] + "mulx r15, r10, qword ptr [{m_ptr} + 0]", + "adcx r10, r11", + "mov r11, r15", + "mov r10, 0", + // for j=1 to N-1 + // (C, t[j-1]) := t[j] + m*M[j] + C + "adcx r11, r13", + "mulx r13, r15, qword ptr [{m_ptr} + 8]", + "adox r11, r15", 
"adcx r13, rax", - "adc r14, 0", - - // r10' * m3 - "mulx rax, rcx, qword ptr [{m_ptr} + 24]", - "add r13, rcx", - "adcx r14, rax", - "adc r15, 0", - - // `r11` -> 0 - "mov rdx, {inv}", - "mulx rax, rdx, r11", - - // r11' * m0 - "mulx rax, rcx, qword ptr [{m_ptr} + 0]", - "add r11, rcx", - "adcx r12, rax", - "adc r13, 0", - - // r11' * m1 - "mulx rax, rcx, qword ptr [{m_ptr} + 8]", - "add r12, rcx", + "mulx rax, r15, qword ptr [{m_ptr} + 16]", + "adox r13, r15", + "adcx rax, r14", + "mulx r14, r15, qword ptr [{m_ptr} + 24]", + "adox rax, r15", + // Reduction carry + "adcx r10, r12", + "adox r14, r10", + + // Outer loop i = 2, j in [0, 4) + "mov rdx, qword ptr [{b_ptr} + 16]", + "xor r12, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 0]", + "adox r11, r15", + "adcx r13, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 8]", + "adox r13, r15", + "adcx rax, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 16]", + "adox rax, r15", + "adcx r14, r12", + // Multiplication, last limb + "mulx r12, r15, qword ptr [{a_ptr} + 24]", + "adox r14, r15", + "mov rdx, 0", // accumulate last carries in hi word + "adcx r12, rdx", + "adox r12, rdx", + + // Reduction + // m = t[0] * m0ninv mod 2^w + "mov rdx, r11", + "imul rdx, {inv}", + "xor r15, r15", + // C,_ := t[0] + m*M[0] + "mulx r15, r10, qword ptr [{m_ptr} + 0]", + "adcx r10, r11", + "mov r11, r15", + "mov r10, 0", + // for j=1 to N-1 + // (C, t[j-1]) := t[j] + m*M[j] + C + "adcx r11, r13", + "mulx r13, r15, qword ptr [{m_ptr} + 8]", + "adox r11, r15", "adcx r13, rax", - "adc r14, 0", - - // r11' * m2 - "mulx rax, rcx, qword ptr [{m_ptr} + 16]", - "add r13, rcx", - "adcx r14, rax", - "adc r15, 0", - - // r11' * m3 - "mulx rax, rcx, qword ptr [{m_ptr} + 24]", - "add r14, rcx", - "adcx r15, rax", - - // reduction if limbs is greater then mod - "mov r8, r12", - "mov r9, r13", - "mov r10, r14", - "mov r11, r15", - - "sub r8, qword ptr [{m_ptr} + 0]", - "sbb r9, qword ptr [{m_ptr} + 8]", - "sbb r10, qword ptr [{m_ptr} + 16]", - "sbb r11, 
qword ptr [{m_ptr} + 24]", - - "cmovc r8, r12", - "cmovc r9, r13", - "cmovc r10, r14", - "cmovc r11, r15", - - "mov r12, r8", - "mov r13, r9", - "mov r14, r10", - "mov r15, r11", - - "sub r12, qword ptr [{m_ptr} + 0]", - "sbb r13, qword ptr [{m_ptr} + 8]", - "sbb r14, qword ptr [{m_ptr} + 16]", - "sbb r15, qword ptr [{m_ptr} + 24]", - - "cmovc r12, r8", - "cmovc r13, r9", - "cmovc r14, r10", - "cmovc r15, r11", + "mulx rax, r15, qword ptr [{m_ptr} + 16]", + "adox r13, r15", + "adcx rax, r14", + "mulx r14, r15, qword ptr [{m_ptr} + 24]", + "adox rax, r15", + // Reduction carry + "adcx r10, r12", + "adox r14, r10", + + // Outer loop i = 3, j in [0, 4) + "mov rdx, qword ptr [{b_ptr} + 24]", + "xor r12, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 0]", + "adox r11, r15", + "adcx r13, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 8]", + "adox r13, r15", + "adcx rax, r12", + "mulx r12, r15, qword ptr [{a_ptr} + 16]", + "adox rax, r15", + "adcx r14, r12", + // Multiplication, last limb + "mulx r12, r15, qword ptr [{a_ptr} + 24]", + "adox r14, r15", + "mov rdx, 0", // accumulate last carries in hi word + "adcx r12, rdx", + "adox r12, rdx", + + // Reduction + // m = t[0] * m0ninv mod 2^w + "mov rdx, r11", + "imul rdx, {inv}", + "xor r15, r15", + // C,_ := t[0] + m*M[0] + "mulx r15, r10, qword ptr [{m_ptr} + 0]", + "adcx r10, r11", + "mov r11, r15", + "mov r10, 0", + // for j=1 to N-1 + // (C, t[j-1]) := t[j] + m*M[j] + C + "adcx r11, r13", + "mulx r13, r15, qword ptr [{m_ptr} + 8]", + "adox r11, r15", "adcx r13, rax", + "mulx rax, r15, qword ptr [{m_ptr} + 16]", + "adox r13, r15", + "adcx rax, r14", + "mulx r14, r15, qword ptr [{m_ptr} + 24]", + "adox rax, r15", + // Reduction carry + "adcx r10, r12", + "adox r14, r10", + + // Final subtraction + "mov r12, r11", + "sub r12, qword ptr [{m_ptr} + 0]", + "mov r10, r13", + "sbb r10, qword ptr [{m_ptr} + 8]", + "mov rdx, rax", + "sbb rdx, qword ptr [{m_ptr} + 16]", + "mov r15, r14", + "sbb r15, qword ptr [{m_ptr} + 24]", + +
"cmovnc r11, r12", + "cmovnc r13, r10", + "cmovnc rax, rdx", + "cmovnc r14, r15", m_ptr = in(reg) $modulus.0.as_ptr(), a_ptr = in(reg) self.0.as_ptr(), b_ptr = in(reg) rhs.0.as_ptr(), - inv = const $inv, - out("rax") _, - out("rcx") _, + inv = in(reg) $inv, + out("rax") r2, + // out("rcx") _, out("rdx") _, - out("r8") _, - out("r9") _, + // out("rbx") _, + // out("rsp") _, + // out("rbp") _, + // out("rsi") _, + // out("rdi") _, + // out("r8") _, + // out("r9") _, out("r10") _, - out("r11") _, - out("r12") r0, + out("r11") r0, + out("r12") _, out("r13") r1, - out("r14") r2, - out("r15") r3, + out("r14") r3, + out("r15") _, options(pure, readonly, nostack) ) } From d24c558768f539384119927b946d355e47f253c9 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Tue, 13 Jun 2023 10:49:33 +0200 Subject: [PATCH 4/6] remove need for nightly for asm by using register instead of constant --- src/bn256/assembly.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bn256/assembly.rs b/src/bn256/assembly.rs index 9ee13ee1..8c01ced3 100644 --- a/src/bn256/assembly.rs +++ b/src/bn256/assembly.rs @@ -234,7 +234,7 @@ macro_rules! field_arithmetic_asm { a_ptr = in(reg) a.as_ptr(), m_ptr = in(reg) $modulus.0.as_ptr(), - inv = const $inv, + inv = in(reg) $inv, out("rax") _, out("rcx") _, out("rdx") _, From c5b37377fa82c4c166925e3802efc5bf752cbf93 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Tue, 13 Jun 2023 10:51:02 +0200 Subject: [PATCH 5/6] remove unused regs --- src/bn256/assembly.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/bn256/assembly.rs b/src/bn256/assembly.rs index 8c01ced3..43b1170a 100644 --- a/src/bn256/assembly.rs +++ b/src/bn256/assembly.rs @@ -478,15 +478,7 @@ macro_rules! 
field_arithmetic_asm { b_ptr = in(reg) rhs.0.as_ptr(), inv = in(reg) $inv, out("rax") r2, - // out("rcx") _, out("rdx") _, - // out("rbx") _, - // out("rsp") _, - // out("rbp") _, - // out("rsi") _, - // out("rdi") _, - // out("r8") _, - // out("r9") _, out("r10") _, out("r11") r0, out("r12") _, From fd1bc85b8d80f0f401f843c8934e54d2b955562c Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Tue, 13 Jun 2023 10:52:34 +0200 Subject: [PATCH 6/6] run cargo fmt --- benches/bn256_field.rs | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/benches/bn256_field.rs b/benches/bn256_field.rs index 627c8501..8a17aef3 100644 --- a/benches/bn256_field.rs +++ b/benches/bn256_field.rs @@ -22,12 +22,24 @@ pub fn bench_bn256_field(c: &mut Criterion) { group.significance_level(0.1).sample_size(10000); group.throughput(Throughput::Elements(1)); - group.bench_function("bn256_fq_add", |bencher| bencher.iter(|| black_box(&a).add(black_box(&b)))); - group.bench_function("bn256_fq_double", |bencher| bencher.iter(|| black_box(&a).double())); - group.bench_function("bn256_fq_sub", |bencher| bencher.iter(|| black_box(&a).sub(black_box(&b)))); - group.bench_function("bn256_fq_neg", |bencher| bencher.iter(|| black_box(&a).neg())); - group.bench_function("bn256_fq_mul", |bencher| bencher.iter(|| black_box(&a).mul(black_box(&b)))); - group.bench_function("bn256_fq_square", |bencher| bencher.iter(|| black_box(&a).square())); + group.bench_function("bn256_fq_add", |bencher| { + bencher.iter(|| black_box(&a).add(black_box(&b))) + }); + group.bench_function("bn256_fq_double", |bencher| { + bencher.iter(|| black_box(&a).double()) + }); + group.bench_function("bn256_fq_sub", |bencher| { + bencher.iter(|| black_box(&a).sub(black_box(&b))) + }); + group.bench_function("bn256_fq_neg", |bencher| { + bencher.iter(|| black_box(&a).neg()) + }); + group.bench_function("bn256_fq_mul", |bencher| { + bencher.iter(|| black_box(&a).mul(black_box(&b))) + }); + 
group.bench_function("bn256_fq_square", |bencher| { + bencher.iter(|| black_box(&a).square()) + }); } criterion_group!(benches, bench_bn256_field);